diff --git a/.bazelrc b/.bazelrc
index 0322618b53f..ff910cd186e 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -149,6 +149,12 @@ build --experimental_cc_shared_library
# cc_shared_library ensures no library is linked statically more than once.
build --experimental_link_static_libraries_once=false
+# Prevent regressions on those two incompatible changes
+# TODO: remove those flags when they are flipped in the default Bazel version TF uses.
+build --incompatible_enforce_config_setting_visibility
+# TODO: also enable this flag after fixing the visibility violations
+# build --incompatible_config_setting_private_default_visibility
+
# Default options should come above this line.
# Allow builds using libc++ as a linker library
@@ -324,7 +330,9 @@ build:linux --copt="-Wunused-result"
# build:linux --copt="-Werror=unused-result"
# Add switch as an error on Linux.
build:linux --copt="-Wswitch"
-# build:linux --copt="-Werror=switch"
+build:linux --copt="-Werror=switch"
+# Required for building with clang
+build:linux --copt="-Wno-error=unused-but-set-variable"
# On Windows, `__cplusplus` is wrongly defined without this switch
# See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
@@ -382,8 +390,8 @@ build:windows --host_copt=-DNOGDI
# MSVC (Windows): Standards-conformant preprocessor mode
# See https://docs.microsoft.com/en-us/cpp/preprocessor/preprocessor-experimental-overview
-build:windows --copt=/experimental:preprocessor
-build:windows --host_copt=/experimental:preprocessor
+build:windows --copt=/Zc:preprocessor
+build:windows --host_copt=/Zc:preprocessor
# Misc build options we need for windows.
build:windows --linkopt=/DEBUG
@@ -559,8 +567,8 @@ build:rbe_linux_py3_base --python_path="/usr/local/bin/python3.9"
build:rbe_linux_py3_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_python3.9"
build:rbe_win --config=rbe
-build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_06152022:toolchain"
-build:rbe_win --extra_toolchains="//tensorflow/tools/toolchains/win/tf_win_06152022:cc-toolchain-x64_windows"
+build:rbe_win --crosstool_top="//tensorflow/tools/toolchains/win/tf_win_01232023:toolchain"
+build:rbe_win --extra_toolchains="//tensorflow/tools/toolchains/win/tf_win_01232023:cc-toolchain-x64_windows"
build:rbe_win --extra_execution_platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
build:rbe_win --host_platform="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
build:rbe_win --platforms="//tensorflow/tools/toolchains/win:rbe_windows_ltsc2019"
@@ -672,6 +680,7 @@ build:asan --copt -g
build:asan --copt -O3
build:asan --copt -fno-omit-frame-pointer
build:asan --linkopt -fsanitize=address
+build:asan --@libjpeg_turbo//:noasm=yes
# Memory sanitizer
# CC=clang bazel build --config msan
@@ -695,7 +704,17 @@ build:ubsan --linkopt -fsanitize=undefined
build:ubsan --linkopt -lubsan
# Disable TFRT integration for now unless --config=tfrt is specified.
-build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/common,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
+build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
# TODO(b/240450920): We are in the process of migrating JitRt backend to XLA
# and while we are doing this we can't keep it buildable/testable in OSS.
-build:tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/common,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
+build:tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils
+
+# TF Fuzztest config
+try-import fuzztest.bazelrc
+run:tf_fuzztest --config=fuzztest
+# Should aim to remove these
+build:tf_fuzztest --action_env=CC=clang
+build:tf_fuzztest --action_env=CXX=clang++
+build:tf_fuzztest --spawn_strategy=sandboxed
+build:tf_fuzztest --config=monolithic
+build:tf_fuzztest --@libjpeg_turbo//:noasm=yes
diff --git a/.bazelversion b/.bazelversion
index e230c8396d1..f53152b50eb 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1,2 @@
-5.3.0
\ No newline at end of file
+5.3.0
+# NOTE: Update Bazel version in tensorflow/tools/ci_build/release/common.sh.oss
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml b/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml
index 6e4753d8674..70bdc6160cb 100644
--- a/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml
+++ b/.github/ISSUE_TEMPLATE/tensorflow_issue_template.yaml
@@ -23,6 +23,17 @@ body:
value: |
Please make sure that this is a bug. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md),we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub.
+ - type: dropdown
+ id: tf-nightly
+ attributes:
+ label: Have you reproduced the bug with TF nightly?
+ description: It is strongly suggested that you have reproduced the bug with [TF nightly](https://www.tensorflow.org/install/pip#nightly)
+ options:
+ - "Yes"
+ - "No"
+ validations:
+ required: true
+
- type: markdown
attributes:
value: |
@@ -38,6 +49,7 @@ body:
- binary
validations:
required: true
+
- type: input
id: tfversion
attributes:
diff --git a/.github/ISSUE_TEMPLATE/tflite-other.md b/.github/ISSUE_TEMPLATE/tflite-other.md
new file mode 100644
index 00000000000..8b8246f2b72
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/tflite-other.md
@@ -0,0 +1,62 @@
+name: TensorFlow Lite Other Issue description: Use this template to report any
+issue in TensorFlow Lite that is not about Converters, Play Services or Ops
+body: - type: dropdown id: issue-type attributes: label: Issue Type description:
+What type of issue would you like to report? multiple: false options: - Bug -
+Build/Install - Performance - Support - Feature Request - Documentation Feature
+Request - Documentation Bug - Others validations: required: true - type:
+markdown attributes: value: | Please make sure that this is a bug. As per our
+[GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we
+only address code/doc bugs, performance issues, feature requests and
+build/installation issues on GitHub.
+
+- type: markdown
+ attributes:
+ value: |
+ You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh) You can also obtain the TensorFlow version with:
1. TF 1.0: `python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"`
2. TF 2.0: `python -c "import tensorflow as tf; print(tf.version.GIT_VERSION, tf.version.VERSION)"`
+
+- type: dropdown id: source attributes: label: Source description: Tensorflow
+ installed from options: - source - binary validations: required: true
+
+- type: input id: tfversion attributes: label: Tensorflow Version description:
+  placeholder: e.g., tf 2.8 validations: required: true
+
+- type: dropdown id: Code attributes: label: Custom Code description:
+ options: - "Yes" - "No" validations: required: true
+
+- type: input id: OS attributes: label: OS Platform and Distribution
+ description: placeholder: e.g., Linux Ubuntu 16.04 validations: required:
+ false
+
+- type: input id: Mobile attributes: label: Mobile device description:
+ placeholder: e.g., Linux Ubuntu 16.04 validations: required: false
+
+- type: input id: Python attributes: label: Python version description:
+ placeholder: e.g., 3.9 validations: required: false
+
+- type: input id: Bazel attributes: label: Bazel version description: if
+ compiling from source placeholder: validations: required: false
+
+- type: input id: Compiler attributes: label: GCC/Compiler version
+ description: if compiling from source placeholder: validations: required:
+ false
+
+- type: input id: Cuda attributes: label: CUDA/cuDNN version description:
+ placeholder: validations: required: false
+
+- type: input id: Gpu attributes: label: GPU model and memory description: if
+ compiling from source placeholder: validations: required: false
+
+- type: textarea id: what-happened attributes: label: Current Behaviour?
+ description: Also tell us, what did you expect to happen? placeholder: Tell
+ us what you see! value: "A bug happened!" render: shell validations:
+ required: true
+
+- type: textarea id: code-to-reproduce attributes: label: Standalone code to
+ reproduce the issue description: Provide a reproducible test case that is
+ the bare minimum necessary to generate the problem. If possible, please
+ share a link to Colab/Jupyter/any notebook. placeholder: Tell us what you
+ see! value: render: shell validations: required: true
+
+- type: textarea id: logs attributes: label: Relevant log output description:
+ Please copy and paste any relevant log output. This will be automatically
+ formatted into code, so no need for backticks. render: shell
diff --git a/.github/bot_config.yml b/.github/bot_config.yml
index 3f039b9e176..bab88af1a8e 100644
--- a/.github/bot_config.yml
+++ b/.github/bot_config.yml
@@ -15,9 +15,10 @@
# A list of assignees
assignees:
- - tilakrayal
+ - synandi
- tiruk007
- - Mohantym
+ - gaikwadrahul8
+ - pjpratik
# A list of assignees for compiler folder
compiler_assignees:
- joker-eph
diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml
index 1698cf0f0b3..b601b0054c7 100644
--- a/.github/workflows/arm-cd.yml
+++ b/.github/workflows/arm-cd.yml
@@ -26,9 +26,14 @@ jobs:
build:
if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
runs-on: [self-hosted, linux, ARM64]
+ continue-on-error: ${{ matrix.experimental }}
strategy:
matrix:
- pyver: ['3.7', '3.8', '3.9', '3.10']
+ pyver: ['3.8', '3.9', '3.10']
+ experimental: [false]
+ include:
+ - pyver: '3.11'
+ experimental: true
steps:
- name: Stop old running containers (if any)
shell: bash
@@ -46,12 +51,12 @@ jobs:
run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
- name: Checkout repository for nightly (skipped for releases)
if: ${{ github.event_name == 'schedule' }}
- uses: actions/checkout@v3
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
with:
ref: 'nightly'
- name: Checkout repository for releases (skipped for nightly)
if: ${{ github.event_name == 'push' }}
- uses: actions/checkout@v3
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Build and test pip wheel
shell: bash
run: |
diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml
index 0fcf49e340a..1592f4ed18a 100644
--- a/.github/workflows/arm-ci-extended.yml
+++ b/.github/workflows/arm-ci-extended.yml
@@ -50,7 +50,7 @@ jobs:
shell: bash
run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Build binary and run non-pip tests
shell: bash
run: |
diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml
index 067e29131e7..e6ddbb9eec9 100644
--- a/.github/workflows/arm-ci.yml
+++ b/.github/workflows/arm-ci.yml
@@ -21,14 +21,15 @@ on:
- master
- r2.**
pull_request:
- types: [opened, synchronize, reopened]
+ types: [labeled, opened, synchronize, reopened]
branches:
- master
- r2.**
jobs:
build:
- if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
+ # Don't do this in forks, and if labeled, only for 'kokoro:force-run'
+ if: github.repository == 'tensorflow/tensorflow' && (github.event.action != 'labeled' || (github.event.action == 'labeled' && github.event.label.name == 'kokoro:force-run'))
runs-on: [self-hosted, linux, ARM64]
strategy:
matrix:
@@ -49,14 +50,14 @@ jobs:
shell: bash
run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Build and test pip wheel
shell: bash
run: |
CI_DOCKER_BUILD_EXTRA_PARAMS='--build-arg py_major_minor_version=${{ matrix.pyver }}' \
./tensorflow/tools/ci_build/ci_build.sh cpu.arm64 bash tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_pip.sh
- name: Upload pip wheel to GitHub
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@83fd05a356d7e2593de66fc9913b3002723633cb # v3.1.1
with:
name: tensorflow_py${{ matrix.pyver }}_wheel
path: /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/whl/*.whl
diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml
index fdae2ac19e6..21ac759f3ef 100644
--- a/.github/workflows/cffconvert.yml
+++ b/.github/workflows/cffconvert.yml
@@ -27,9 +27,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out a copy of the repository
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Check whether the citation metadata from CITATION.cff is valid
- uses: citation-file-format/cffconvert-github-action@2.0.0
+ uses: citation-file-format/cffconvert-github-action@4cf11baa70a673bfdf9dad0acc7ee33b3f4b6084 # v2.0.0
with:
args: "--validate"
diff --git a/.github/workflows/issue-on-pr-rollback.yml b/.github/workflows/issue-on-pr-rollback.yml
index ce0182bedc2..fa76923a2ba 100644
--- a/.github/workflows/issue-on-pr-rollback.yml
+++ b/.github/workflows/issue-on-pr-rollback.yml
@@ -27,9 +27,9 @@ jobs:
startsWith(github.event.head_commit.message, 'Rollback of PR #')
steps:
- name: Checkout repo
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Create a new Github Issue
- uses: actions/github-script@v5
+ uses: actions/github-script@d556feaca394842dc55e4734bf3bb9f685482fa0 # v6.3.3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
diff --git a/.github/workflows/pylint-presubmit.yml b/.github/workflows/pylint-presubmit.yml
index f1b539f551b..e97f34472d8 100644
--- a/.github/workflows/pylint-presubmit.yml
+++ b/.github/workflows/pylint-presubmit.yml
@@ -25,17 +25,17 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Get file changes
id: get_file_changes
- uses: trilom/file-changes-action@v1.2.4
+ uses: trilom/file-changes-action@a6ca26c14274c33b15e6499323aac178af06ad4b # v1.2.4
with:
output: ' '
- name: Report list of changed files
run: |
echo Changed files: ${{ steps.get_file_changes.outputs.files }}
- name: Set up Python 3.9
- uses: actions/setup-python@v2
+ uses: actions/setup-python@2c3dd9e7e29afd70cc0950079bde6c979d1f69f9 # v4.3.1
with:
python-version: "3.9"
- name: Install Python dependencies
diff --git a/.github/workflows/release-branch-cherrypick.yml b/.github/workflows/release-branch-cherrypick.yml
index a57852a9644..5ff69e46805 100644
--- a/.github/workflows/release-branch-cherrypick.yml
+++ b/.github/workflows/release-branch-cherrypick.yml
@@ -42,7 +42,7 @@ jobs:
if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
steps:
- name: Checkout code
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
with:
ref: ${{ github.event.inputs.release_branch }}
- name: Get some helpful info for formatting
@@ -52,10 +52,10 @@ jobs:
git config --global user.email "jenkins@tensorflow.org"
git fetch origin master
git cherry-pick ${{ github.event.inputs.git_commit }}
- echo ::set-output name=SHORTSHA::$(git log -1 ${{ github.event.inputs.git_commit }} --format="%h")
- echo ::set-output name=TITLE::$(git log -1 ${{ github.event.inputs.git_commit }} --format="%s")
+ echo "SHORTSHA=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%h")" >> "$GITHUB_OUTPUT"
+ echo "TITLE=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%s")" >> "$GITHUB_OUTPUT"
- name: Create Pull Request with changes
- uses: peter-evans/create-pull-request@v3
+ uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3
with:
title: '${{ github.event.inputs.release_branch }} cherry-pick: ${{ steps.cherrypick.outputs.SHORTSHA }} "${{ steps.cherrypick.outputs.TITLE }}"'
committer: TensorFlow Release Automation
diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml
index 8f9dab872b6..1c520aa86fd 100644
--- a/.github/workflows/scorecards-analysis.yml
+++ b/.github/workflows/scorecards-analysis.yml
@@ -34,23 +34,18 @@ jobs:
# Needed to upload the results to code-scanning dashboard.
security-events: write
id-token: write
- actions: read
- contents: read
steps:
- name: "Checkout code"
- uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
with:
persist-credentials: false
- name: "Run analysis"
- uses: ossf/scorecard-action@08dd0cebb088ac0fd6364339b1b3b68b75041ea8 # v2.0.0-alpha.2
+ uses: ossf/scorecard-action@15c10fcf1cf912bd22260bfec67569a359ab87da # v2.1.1
with:
results_file: results.sarif
results_format: sarif
- # Read-only PAT token. To create it,
- # follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
- repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
# Publish the results to enable scorecard badges. For more details, see
# https://github.com/ossf/scorecard-action#publishing-results.
# For private repositories, `publish_results` will automatically be set to `false`,
@@ -59,7 +54,7 @@ jobs:
# Upload the results as artifacts (optional).
- name: "Upload artifact"
- uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
+ uses: actions/upload-artifact@83fd05a356d7e2593de66fc9913b3002723633cb # v3.1.1
with:
name: SARIF file
path: results.sarif
@@ -67,6 +62,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
- uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
+ uses: github/codeql-action/upload-sarif@896079047b4bb059ba6f150a5d87d47dde99e6e5 # v2.11.6
with:
sarif_file: results.sarif
diff --git a/.github/workflows/sigbuild-docker-branch.yml b/.github/workflows/sigbuild-docker-branch.yml
index 41b0fe5a13a..c898381efd5 100644
--- a/.github/workflows/sigbuild-docker-branch.yml
+++ b/.github/workflows/sigbuild-docker-branch.yml
@@ -31,23 +31,23 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [python3.7, python3.8, python3.9, python3.10]
+ python-version: [python3.8, python3.9, python3.10, python3.11]
steps:
-
name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
-
name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@8c0edbc76e98fa90f69d9a2c020dcb50019dc325 # v2.2.1
-
name: Login to DockerHub
- uses: docker/login-action@v1
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Login to GCR
- uses: docker/login-action@v1
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
with:
registry: gcr.io
username: _json_key
@@ -55,14 +55,14 @@ jobs:
-
name: Generate variables for cache busting and tag naming
run: |
- echo "::set-output name=DATE::$(date +'%Y-%m-%d')"
+ echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
# Converts r2.9 to just 2.9
- echo "::set-output name=REF::$(echo $GITHUB_REF_NAME | sed 's/r//g')"
+ echo "REF=$(echo $GITHUB_REF_NAME | sed 's/r//g')" >> "$GITHUB_OUTPUT"
id: vars
-
name: Build and push
id: docker_build
- uses: docker/build-push-action@v2
+ uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
with:
push: true
context: ./tensorflow/tools/tf_sig_build_dockerfiles
diff --git a/.github/workflows/sigbuild-docker-presubmit.yml b/.github/workflows/sigbuild-docker-presubmit.yml
index c77c0d66311..065fd91319e 100644
--- a/.github/workflows/sigbuild-docker-presubmit.yml
+++ b/.github/workflows/sigbuild-docker-presubmit.yml
@@ -29,18 +29,18 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [python3.7, python3.8, python3.9, python3.10]
+ python-version: [python3.8, python3.9, python3.10, python3.11]
steps:
-
name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
-
name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@8c0edbc76e98fa90f69d9a2c020dcb50019dc325 # v2.2.1
-
name: Login to GCR
if: contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging')
- uses: docker/login-action@v1
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
with:
registry: gcr.io
username: _json_key
@@ -48,12 +48,12 @@ jobs:
-
name: Grab the date to do cache busting (assumes same day OK to keep)
run: |
- echo "::set-output name=DATE::$(date +'%Y-%m-%d')"
+ echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
id: date
-
name: Build containers, and push to GCR only if the 'build and push to gcr.io for staging' label is applied
id: docker_build
- uses: docker/build-push-action@v2
+ uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
with:
push: ${{ contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging') }}
context: ./tensorflow/tools/tf_sig_build_dockerfiles
@@ -69,17 +69,17 @@ jobs:
cache-to: type=inline
-
name: Add a comment with the pushed containers
- uses: mshick/add-pr-comment@v1
+ uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1 # v2
if: contains(github.event.pull_request.labels.*.name, 'build and push to gcr.io for staging')
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
message: |
I pushed these containers:
+ - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.11`
- `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.10`
- `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.9`
- `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.8`
- - `gcr.io/tensorflow-sigs/build:${{ github.event.number }}-python3.7`
Re-apply the `build and push to gcr.io for staging` label to rebuild and push again. This comment will only be posted once.
-
diff --git a/.github/workflows/sigbuild-docker.yml b/.github/workflows/sigbuild-docker.yml
index 276a0abc242..c9b12a39076 100644
--- a/.github/workflows/sigbuild-docker.yml
+++ b/.github/workflows/sigbuild-docker.yml
@@ -34,23 +34,23 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [python3.7, python3.8, python3.9, python3.10]
+ python-version: [python3.8, python3.9, python3.10, python3.11]
steps:
-
name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
-
name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@8c0edbc76e98fa90f69d9a2c020dcb50019dc325 # v2.2.1
-
name: Login to DockerHub
- uses: docker/login-action@v1
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Login to GCR
- uses: docker/login-action@v1
+ uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a # v2.1.0
with:
registry: gcr.io
username: _json_key
@@ -61,15 +61,15 @@ jobs:
# [[:digit:]] searches for numbers and \+ joins them together
major_version=$(grep "^#define TF_MAJOR_VERSION" ./tensorflow/core/public/version.h | grep -o "[[:digit:]]\+")
minor_version=$(grep "^#define TF_MINOR_VERSION" ./tensorflow/core/public/version.h | grep -o "[[:digit:]]\+")
- echo ::set-output name=TF_VERSION::${major_version}.${minor_version}
+ echo "TF_VERSION=${major_version}.${minor_version}" >> "$GITHUB_OUTPUT"
# Also get the current date to do cache busting. Assumes one day
# is an ok range for rebuilds
- echo "::set-output name=DATE::$(date +'%Y-%m-%d')"
+ echo "DATE=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
id: tf-version
-
name: Build and push
id: docker_build
- uses: docker/build-push-action@v2
+ uses: docker/build-push-action@c56af957549030174b10d6867f20e78cfd7debc5 # v3.2.0
with:
push: true
context: ./tensorflow/tools/tf_sig_build_dockerfiles
diff --git a/.github/workflows/trusted-partners.yml b/.github/workflows/trusted-partners.yml
index abf62dd2b8a..7c2fb863d15 100644
--- a/.github/workflows/trusted-partners.yml
+++ b/.github/workflows/trusted-partners.yml
@@ -30,9 +30,9 @@ jobs:
github.event.sender.type == 'User'
steps:
- name: Checkout repo
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Trusted-Partners-PR
- uses: actions/github-script@v6
+ uses: actions/github-script@d556feaca394842dc55e4734bf3bb9f685482fa0 # v6.3.3
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -49,6 +49,9 @@ jobs:
case "nvidia.com":
console.log(await script.filter({github, context, domain}));
break;
+ case "linaro.org":
+ console.log(await script.filter({github, context, domain}));
+ break;
case "google.com":
console.log("Googler. No action necessary");
break;
diff --git a/.github/workflows/trusted_partners.js b/.github/workflows/trusted_partners.js
index 6b6de25946e..60de918108d 100644
--- a/.github/workflows/trusted_partners.js
+++ b/.github/workflows/trusted_partners.js
@@ -39,9 +39,9 @@ const get_email_domain = async ({github, username}) => {
return domain;
};
-/** For trusted parters like Intel, we want to auto-run tests and mark the PR as ready to pull
+/** For trusted partners like Intel, we want to auto-run tests
This allows us to reduce the delay to external partners
- Add Labels - kokoro:force-run, ready to pull
+ Add Labels - kokoro:force-run
The PR is also assigned to specific teams to fast track review
Additional reviewers can be added manually based on PR contents
@param {!object}
@@ -50,34 +50,41 @@ const get_email_domain = async ({github, username}) => {
@return {string} Returns the message with labels attached and assignees added
*/
const filter_action = async ({github, context, domain}) => {
- const labels = ['kokoro:force-run', 'ready to pull'];
+ const labels = ['kokoro:force-run'];
let assignees = [];
const title = context.payload.pull_request && context.payload.pull_request.title;
+ const lowercased_title = (title || '').toLowerCase();
const onednn_assignees = ['penpornk'];
- if (title && title.toLowerCase().includes("onednn"))
- assignees = onednn_assignees;
+ if (lowercased_title.includes('onednn')) assignees = onednn_assignees;
const intel_windows_assignees = ['nitins17', 'learning-to-play'];
- if (title && title.toLowerCase().includes('intel') &&
- title.toLowerCase().includes('windows') && domain.includes('intel.com'))
+ if (lowercased_title.includes('intel') &&
+ lowercased_title.includes('windows') && domain.includes('intel.com'))
assignees = intel_windows_assignees;
const apple_silicon_assignees = ['penpornk', 'nitins17'];
- if (title && title.toLowerCase().includes('apple') &&
- title.toLowerCase().includes('silicon') && domain.includes('apple.com'))
+ if (lowercased_title.includes('apple') &&
+ lowercased_title.includes('silicon') && domain.includes('apple.com'))
assignees = apple_silicon_assignees;
- if (title && title.toLowerCase().includes('nvidia') &&
- domain.includes('nvidia.com')) {
- if (title.toLowerCase().includes('jax')) {
+ if (lowercased_title.includes('tf-trt') && domain.includes('nvidia.com')) {
+ assignees.push(
+ 'DEKHTIARJonathan', 'meena-at-work', 'nluehr', 'pjannaty', 'poulsbo');
+ } else if (
+ lowercased_title.includes('nvidia') && domain.includes('nvidia.com')) {
+ if (lowercased_title.includes('jax')) {
assignees.push('hawkinsp', 'yashk2810', 'skye');
}
- if (title.toLowerCase().includes('xla') ||
- title.toLowerCase().includes('gpu')) {
+ if (lowercased_title.includes('xla') || lowercased_title.includes('gpu')) {
assignees.push('cheshire', 'gcforster', 'reedwm', 'chsigg', 'xla-rotation');
}
- if (title.toLowerCase().includes('tf')) {
+ if (lowercased_title.includes('tf')) {
assignees.push('rohan100jain', 'bfontain');
}
}
+ if (lowercased_title.includes('linaro') && domain.includes('linaro.org')) {
+ if (lowercased_title.includes('arm_ci')) {
+ assignees.push('nitins17', 'penpornk');
+ }
+ }
const resp_label = await github.rest.issues.addLabels({
issue_number: context.issue.number,
diff --git a/.github/workflows/update-nightly.yml b/.github/workflows/update-nightly.yml
index 0265ffbebe2..60372fddd27 100644
--- a/.github/workflows/update-nightly.yml
+++ b/.github/workflows/update-nightly.yml
@@ -23,7 +23,7 @@ jobs:
if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
runs-on: ubuntu-latest
steps:
- - uses: zofrex/mirror-branch@v1
+ - uses: zofrex/mirror-branch@a8809f0b42f9dfe9b2c5c2162a46327c23d15266 # v1.0.3
name: Set nightly branch to master HEAD
with:
target-branch: 'nightly'
diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml
index 2f86ff2b2e5..ce31d59868a 100644
--- a/.github/workflows/update-rbe.yml
+++ b/.github/workflows/update-rbe.yml
@@ -27,7 +27,7 @@ jobs:
if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
steps:
- name: Checkout code
- uses: actions/checkout@v2
+ uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0
- name: Update the RBE Configs
run: |
function map() {
@@ -48,28 +48,40 @@ jobs:
# See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/toolchains/remote_config/configs.bzl
# This is a mapping of name_container_map keys under sigbuild_tf_configs
# to tag names on gcr.io/tensorflow-sigs/build.
+ # TF 2.9
map sigbuild-r2.9 2.9-python3.9
- map sigbuild-r2.9-python3.7 2.9-python3.7
map sigbuild-r2.9-python3.8 2.9-python3.8
map sigbuild-r2.9-python3.9 2.9-python3.9
map sigbuild-r2.9-python3.10 2.9-python3.10
+ # TF 2.10
map sigbuild-r2.10 2.10-python3.9
- map sigbuild-r2.10-python3.7 2.10-python3.7
map sigbuild-r2.10-python3.8 2.10-python3.8
map sigbuild-r2.10-python3.9 2.10-python3.9
map sigbuild-r2.10-python3.10 2.10-python3.10
- map sigbuild-128 128-python3.9
- map sigbuild-128-python3.7 128-python3.7
- map sigbuild-128-python3.8 128-python3.8
- map sigbuild-128-python3.9 128-python3.9
- map sigbuild-128-python3.10 128-python3.10
+ # TF 2.11
map sigbuild-r2.11 2.11-python3.9
- map sigbuild-r2.11-python3.7 2.11-python3.7
map sigbuild-r2.11-python3.8 2.11-python3.8
map sigbuild-r2.11-python3.9 2.11-python3.9
- map sigbuild-r2.11-python3.11 2.11-python3.10
+ map sigbuild-r2.11-python3.10 2.11-python3.10
+ # WIP Clang Containers, used by TVCs
+ map sigbuild-57469 57469-python3.9
+ map sigbuild-57469-python3.8 57469-python3.8
+ map sigbuild-57469-python3.9 57469-python3.9
+ map sigbuild-57469-python3.10 57469-python3.10
+ # TF 2.12
+ map sigbuild-r2.12 2.12-python3.9
+ map sigbuild-r2.12-python3.8 2.12-python3.8
+ map sigbuild-r2.12-python3.9 2.12-python3.9
+ map sigbuild-r2.12-python3.10 2.12-python3.10
+ map sigbuild-r2.12-python3.11 2.12-python3.11
+ # TF 2.12 + Clang (containers are the same, but env vars in configs.bzl are different)
+ map sigbuild-r2.12-clang 2.12-python3.9
+ map sigbuild-r2.12-clang-python3.8 2.12-python3.8
+ map sigbuild-r2.12-clang-python3.9 2.12-python3.9
+ map sigbuild-r2.12-clang-python3.10 2.12-python3.10
+ map sigbuild-r2.12-clang-python3.11 2.12-python3.11
- name: Create Pull Request with changes
- uses: peter-evans/create-pull-request@v3
+ uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3
with:
title: Update the RBE images to the latest container versions
committer: TensorFlow Release Automation
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 01e20da7c87..ccc170b5c6e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -243,7 +243,7 @@ There are two ways to run TensorFlow unit tests.
For a single component e.g. softmax op:
```bash
- bazel test ${flags} tensorflow/python/kernel_tests:softmax_op_test
+ bazel test ${flags} tensorflow/python/kernel_tests/nn_ops:softmax_op_test
```
For a single/parameterized test e.g. `test_capture_variables` in
diff --git a/README.md b/README.md
index 73e75c1df81..c94227d26d7 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,19 @@ for general questions and discussion, and please direct specific questions to
The TensorFlow project strives to abide by generally accepted best practices in
open-source software development.
+## Patching guidelines
+
+Follow these steps to patch a specific version of TensorFlow, for example, to
+apply fixes to bugs or security vulnerabilities:
+
+* Clone the TensorFlow repo and switch to the corresponding branch for your
+ desired TensorFlow version, for example, branch `r2.8` for version 2.8.
+* Apply (that is, cherry pick) the desired changes and resolve any code
+ conflicts.
+* Run TensorFlow tests and ensure they pass.
+* [Build](https://www.tensorflow.org/install/source) the TensorFlow pip
+ package from source.
+
## Continuous build status
You can find more community-supported platforms and configurations in the
diff --git a/RELEASE.md b/RELEASE.md
index 40320f2a172..ea4ab08237e 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,18 +1,114 @@
-# Release 2.12.0
+# Release 2.13.0
-* `tf.keras`:
+# Breaking Changes
- * Added `jit_compile` as a settable property to `tf.keras.Model`.
- * Added `synchronized` optional parameter to `layers.BatchNormalization`.
- * Added deprecation warning to
- `layers.experimental.SyncBatchNormalization` and suggested to use
- `layers.BatchNormalization` with `synchronized=True` instead.
+*
+*
+
+# Known Caveats
+
+*
+*
+*
+
+# Major Features and Improvements
+
+* `tf.lite`:
+
+ * Add 16-bit and 64-bit float type support for built-in op `cast`.
+
+* `tf.keras`
+
+ * Added Keras metrics `tf.keras.metrics.FBetaScore` and
+ `tf.keras.metrics.F1Score`.
+
+# Bug Fixes and Other Changes
+
+*
+*
+*
+
+# Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+, , , , ,
+
+
+# Release 2.12.0
# Breaking Changes
*
*
+* Build, Compilation and Packaging
+
+ * Removal of redundant packages: the `tensorflow-gpu` and `tf-nightly-gpu`
+ packages have been effectively removed and replaced with packages that
+ direct users to switch to `tensorflow` or `tf-nightly` respectively.
+ The naming difference was the only difference between the two sets of
+ packages ever since TensorFlow 2.1, so there is no loss of functionality
+ or GPU support. See
+ https://pypi.org/project/tensorflow-gpu for more details.
+
+* `tf.function`:
+
+ * tf.function now uses the Python inspect library directly for parsing
+ the signature of the Python function it is decorated on.
+ * This can break certain cases that were previously ignored where the
+ signature is malformed, e.g.
+ * Using functools.wraps on a function with different signature
+ * Using functools.partial with an invalid tf.function input
+ * tf.function now enforces input parameter names to be valid Python
+ identifiers. Incompatible names are automatically sanitized similarly to
+ existing SavedModel signature behavior.
+ * Parameterless tf.functions are assumed to have an empty input_signature
+ instead of an undefined one even if the input_signature is unspecified.
+ * tf.types.experimental.TraceType now requires an additional
+ `placeholder_value` method to be defined.
+ * tf.function now traces with placeholder values generated by TraceType
+ instead of the value itself.
+
+* `tf.config.experimental.enable_mlir_graph_optimization`:
+
+ * Experimental API removed.
+
+* `tf.config.experimental.disable_mlir_graph_optimization`:
+
+ * Experimental API removed.
+
+* `tf.keras`
+
+ * Moved all saving-related utilities to a new namespace, `keras.saving`,
+ i.e. `keras.saving.load_model`, `keras.saving.save_model`,
+ `keras.saving.custom_object_scope`, `keras.saving.get_custom_objects`,
+ `keras.saving.register_keras_serializable`,
+ `keras.saving.get_registered_name` and
+ `keras.saving.get_registered_object`.
+ The previous API locations (in `keras.utils` and `keras.models`) will
+ stay available indefinitely, but we recommend that you update your code
+ to point to the new API locations.
+ * Improvements and fixes in Keras loss masking:
+ * Whether you represent a ragged tensor as a `tf.RaggedTensor` or using
+ [keras masking](https://www.tensorflow.org/guide/keras/masking_and_padding),
+ the returned loss values should be the identical to each other.
+ In previous versions Keras may have silently ignored the mask.
+ * If you use masked losses with Keras the loss values may be different
+ in TensorFlow `2.12` compared to previous versions.
+ * In cases where the mask was previously ignored, you will now get
+ an error if you pass a mask with an incompatible shape.
+
+* `tf.SavedModel`
+
+ * Introduce new class `tf.saved_model.experimental.Fingerprint` that
+ contains the fingerprint of the SavedModel. See the
+ [SavedModel Fingerprinting RFC](https://github.com/tensorflow/community/pull/415)
+ for details.
+ * Introduce API `tf.saved_model.experimental.read_fingerprint(export_dir)`
+ for reading the fingerprint of a SavedModel.
+
+
# Known Caveats
*
@@ -25,13 +121,90 @@
* Add 16-bit float type support for built-in op `fill`.
* Transpose now supports 6D tensors.
+ * Float LSTM now supports diagonal recurrent tensors:
+ https://arxiv.org/abs/1903.08023
* `tf.keras`:
+ * The new Keras model saving format (`.keras`) is available. You can start
+ using it via `model.save(f"{fname}.keras", save_format="keras_v3")`. In
+ the future it will become the default for all files with the `.keras`
+ extension. This file format targets the Python runtime only and makes
+ it possible to reload Python objects identical to the saved originals.
+ The format supports non-numerical state such as vocabulary files and
+ lookup tables, and it is easy to customize in the case of custom layers
+ with exotic elements of state (e.g. a FIFOQueue). The format
+ does not rely on bytecode or pickling, and is safe by default. Note
+ that as a result, Python `lambdas` are disallowed at loading time. If
+ you want to use `lambdas`, you can pass `safe_mode=False` to the loading
+ method (only do this if you trust the source of the model).
+ * Added a `model.export(filepath)` API to create a lightweight SavedModel
+ artifact that can be used for inference (e.g. with TF-Serving).
+ * Added `keras.export.ExportArchive` class for low-level customization of
+ the process of exporting SavedModel artifacts for inference.
+ Both ways of exporting models are based on `tf.function` tracing
+ and produce a TF program composed of TF ops. They are meant primarily
+ for environments where the TF runtime is available,
+ but not the Python interpreter, as is typical
+ for production with TF Serving.
+ * Added utility `tf.keras.utils.FeatureSpace`, a one-stop shop for
+ structured data preprocessing and encoding.
* Added `tf.SparseTensor` input support to `tf.keras.layers.Embedding`
layer. The layer now accepts a new boolean argument `sparse`. If
`sparse` is set to True, the layer returns a SparseTensor instead of a
dense Tensor. Defaults to False.
+ * Added `jit_compile` as a settable property to `tf.keras.Model`.
+ * Added `synchronized` optional parameter to `layers.BatchNormalization`.
+ * Added deprecation warning to
+ `layers.experimental.SyncBatchNormalization` and suggested to use
+ `layers.BatchNormalization` with `synchronized=True` instead.
+ * Updated `tf.keras.layers.BatchNormalization` to support masking of the
+ inputs (`mask` argument) when computing the mean and variance.
+ * Add `tf.keras.layers.Identity`, a placeholder pass-through layer.
+ * Add `show_trainable` option to `tf.keras.utils.model_to_dot` to display
+ layer trainable status in model plots.
+ * Add ability to save a `tf.keras.utils.FeatureSpace` object, via
+ `feature_space.save("myfeaturespace.keras")`, and reload it via
+ `feature_space = tf.keras.models.load_model("myfeaturespace.keras")`.
+ * Added utility `tf.keras.utils.to_ordinal` to convert class vector to
+ ordinal regression / classification matrix.
+
+* `tf.experimental.dtensor`:
+
+ * Coordination service now works with
+ `dtensor.initialize_accelerator_system`, and enabled by default.
+ * Add `tf.experimental.dtensor.is_dtensor` to check if a tensor is a
+ DTensor instance.
+
+* `tf.data`:
+
+ * Added support for alternative checkpointing protocol which makes it
+ possible to checkpoint the state of the input pipeline without having to
+ store the contents of internal buffers. The new functionality can be
+ enabled through the `experimental_symbolic_checkpoint` option of
+ `tf.data.Options()`.
+ * Added a new `rerandomize_each_iteration` argument for the
+ `tf.data.Dataset.random()` operation, which controls whether the
+ sequence of generated random numbers should be re-randomized every epoch
+ or not (the default behavior). If `seed` is set and
+ `rerandomize_each_iteration=True`, the `random()` operation will produce
+ a different (deterministic) sequence of numbers every epoch.
+ * Added a new `rerandomize_each_iteration` argument for the
+ `tf.data.Dataset.sample_from_datasets()` operation, which controls
+ whether the sequence of generated random numbers used for sampling
+ should be re-randomized every epoch or not. If `seed` is set and
+ `rerandomize_each_iteration=True`, the `sample_from_datasets()`
+ operation will use a different (deterministic) sequence of numbers every
+ epoch.
+
+* `tf.test`:
+
+ * Added `tf.test.experimental.sync_devices`, which is useful for
+ accurately measuring performance in benchmarks.
+
+* `tf.experimental.dtensor`:
+
+ * Added experimental support to ReduceScatter fuse on GPU (NCCL).
# Bug Fixes and Other Changes
@@ -39,6 +212,29 @@
*
*
+* `tf.random`
+ * Added non-experimental aliases for `tf.random.split` and
+ `tf.random.fold_in`, the experimental endpoints are still available
+ so no code changes are necessary.
+* `tf.experimental.ExtensionType`
+ * Added function `experimental.extension_type.as_dict()`, which converts an
+ instance of `tf.experimental.ExtensionType` to a `dict` representation.
+* `stream_executor`
+ * Top level `stream_executor` directory has been deleted, users should use
+ equivalent headers and targets under `compiler/xla/stream_executor`.
+* `tf.nn`
+ * Added `tf.nn.experimental.general_dropout`, which is similar to
+ `tf.random.experimental.stateless_dropout` but accepts a custom sampler
+ function.
+* `tf.types.experimental.GenericFunction`
+ * The `experimental_get_compiler_ir` method supports tf.TensorSpec
+ compilation arguments.
+* `tf.config.experimental.mlir_bridge_rollout`
+ * Removed enums `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_ENABLED` and
+ `MLIR_BRIDGE_ROLLOUT_SAFE_MODE_FALLBACK_ENABLED` which are no longer used by
+ the tf2xla bridge
+
+
# Thanks to our Contributors
This release contains contributions from many people at Google, as well as:
@@ -47,12 +243,6 @@ This release contains contributions from many people at Google, as well as:
# Release 2.11.0
-
-
-* `StatusOr::ConsumeValueOrDie` and `StatusOr::ValueOrDie`, both deprecated in
- TF 2.10 has been removed.
-
-
## Breaking Changes
* `tf.keras.optimizers.Optimizer` now points to the new Keras optimizer, and
old optimizers have moved to the `tf.keras.optimizers.legacy` namespace.
@@ -106,12 +296,6 @@ This release contains contributions from many people at Google, as well as:
only be implemented based on `tf.keras.optimizers.Optimizer`, the new
base class.
-## Known Caveats
-
-*
-*
-*
-
## Major Features and Improvements
* `tf.lite`:
@@ -160,7 +344,7 @@ This release contains contributions from many people at Google, as well as:
file is a protobuf containing the "fingerprint" of the SavedModel. See
the [RFC](https://github.com/tensorflow/community/pull/415) for more
details regarding its design and properties.
-
+
* `tf.data`:
* Graduated experimental APIs:
* [`tf.data.Dataset.ragged_batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset/#ragged_batch), which batches elements of `tf.data.Dataset`s into `tf.RaggedTensor`s.
@@ -185,11 +369,152 @@ This release contains contributions from many people at Google, as well as:
* `tf.SparseTensor`:
* Introduced `set_shape`, which sets the static dense shape of the sparse tensor and has the same semantics as `tf.Tensor.set_shape`.
+## Security
+
+* TF is currently using giflib 5.2.1 which has [CVE-2022-28506](https://nvd.nist.gov/vuln/detail/CVE-2022-28506). TF is not affected by the CVE as it does not use `DumpScreen2RGB` at all.
+* Fixes an OOB seg fault in `DynamicStitch` due to missing validation ([CVE-2022-41883](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41883))
+* Fixes an overflow in `tf.keras.losses.poisson` ([CVE-2022-41887](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41887))
+* Fixes a heap OOB failure in `ThreadUnsafeUnigramCandidateSampler` caused by missing validation ([CVE-2022-41880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41880))
+* Fixes a segfault in `ndarray_tensor_bridge` ([CVE-2022-41884](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41884))
+* Fixes an overflow in `FusedResizeAndPadConv2D` ([CVE-2022-41885](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41885))
+* Fixes an overflow in `ImageProjectiveTransformV2` ([CVE-2022-41886](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41886))
+* Fixes an FPE in `tf.image.generate_bounding_box_proposals` on GPU ([CVE-2022-41888](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41888))
+* Fixes a segfault in `pywrap_tfe_src` caused by invalid attributes ([CVE-2022-41889](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41889))
+* Fixes a `CHECK` fail in `BCast` ([CVE-2022-41890](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41890))
+* Fixes a segfault in `TensorListConcat` ([CVE-2022-41891](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41891))
+* Fixes a `CHECK_EQ` fail in `TensorListResize` ([CVE-2022-41893](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41893))
+* Fixes an overflow in `CONV_3D_TRANSPOSE` on TFLite ([CVE-2022-41894](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41894))
+* Fixes a heap OOB in `MirrorPadGrad` ([CVE-2022-41895](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41895))
+* Fixes a crash in `Mfcc` ([CVE-2022-41896](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41896))
+* Fixes a heap OOB in `FractionalMaxPoolGrad` ([CVE-2022-41897](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41897))
+* Fixes a `CHECK` fail in `SparseFillEmptyRowsGrad` ([CVE-2022-41898](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41898))
+* Fixes a `CHECK` fail in `SdcaOptimizer` ([CVE-2022-41899](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41899))
+* Fixes a heap OOB in `FractionalAvgPool` and `FractionalMaxPool`([CVE-2022-41900](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41900))
+* Fixes a `CHECK_EQ` in `SparseMatrixNNZ` ([CVE-2022-41901](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41901))
+* Fixes an OOB write in grappler ([CVE-2022-41902](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41902))
+* Fixes an overflow in `ResizeNearestNeighborGrad` ([CVE-2022-41907](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41907))
+* Fixes a `CHECK` fail in `PyFunc` ([CVE-2022-41908](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41908))
+* Fixes a segfault in `CompositeTensorVariantToComponents` ([CVE-2022-41909](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41909))
+* Fixes an invalid char to bool conversion in printing a tensor ([CVE-2022-41911](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41911))
+* Fixes a heap overflow in `QuantizeAndDequantizeV2` ([CVE-2022-41910](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41910))
+* Fixes a `CHECK` failure in `SobolSample` via missing validation ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+* Fixes a `CHECK` fail in `TensorListScatter` and `TensorListScatterV2` in eager mode ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+
## Thanks to our Contributors
This release contains contributions from many people at Google, as well as:
-, , , , ,
+103yiran, 8bitmp3, Aakar Dwivedi, Alexander Grund, alif_elham, Aman Agarwal,
+amoitra, Andrei Ivanov, andreii, Andrew Goodbody, angerson, Ashay Rane,
+Azeem Shaikh, Ben Barsdell, bhack, Bhavani Subramanian, Cedric Nugteren,
+Chandra Kumar Ramasamy, Christopher Bate, CohenAriel, Cotarou, cramasam,
+Enrico Minack, Francisco Unda, Frederic Bastien, gadagashwini, Gauri1 Deshpande,
+george, Jake, Jeff, Jerry Ge, Jingxuan He, Jojimon Varghese, Jonathan Dekhtiar,
+Kaixi Hou, Kanvi Khanna, kcoul, Keith Smiley, Kevin Hu, Kun Lu, kushanam,
+Lianmin Zheng, liuyuanqiang, Louis Sugy, Mahmoud Abuzaina, Marius Brehler,
+mdfaijul, Meenakshi Venkataraman, Milos Puzovic, mohantym, Namrata-Ibm,
+Nathan John Sircombe, Nathan Luehr, Olaf Lipinski, Om Thakkar, Osman F Bayram,
+Patrice Vignola, Pavani Majety, Philipp Hack, Prianka Liz Kariat, Rahul Batra,
+RajeshT, Renato Golin, riestere, Roger Iyengar, Rohit Santhanam, Rsanthanam-Amd,
+Sadeed Pv, Samuel Marks, Shimokawa, Naoaki, Siddhesh Kothadi, Simengliu-Nv,
+Sindre Seppola, snadampal, Srinivasan Narayanamoorthy, sushreebarsa,
+syedshahbaaz, Tamas Bela Feher, Tatwai Chong, Thibaut Goetghebuer-Planchon,
+tilakrayal, Tom Anderson, Tomohiro Endo, Trevor Morris, vibhutisawant,
+Victor Zhang, Vremold, Xavier Bonaventura, Yanming Wang, Yasir Modak,
+Yimei Sun, Yong Tang, Yulv-Git, zhuoran.liu, zotanika
+
+# Release 2.10.1
+
+This release introduces several vulnerability fixes:
+
+* Fixes an OOB seg fault in `DynamicStitch` due to missing validation ([CVE-2022-41883](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41883))
+* Fixes an overflow in `tf.keras.losses.poisson` ([CVE-2022-41887](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41887))
+* Fixes a heap OOB failure in `ThreadUnsafeUnigramCandidateSampler` caused by missing validation ([CVE-2022-41880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41880))
+* Fixes a segfault in `ndarray_tensor_bridge` ([CVE-2022-41884](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41884))
+* Fixes an overflow in `FusedResizeAndPadConv2D` ([CVE-2022-41885](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41885))
+* Fixes an overflow in `ImageProjectiveTransformV2` ([CVE-2022-41886](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41886))
+* Fixes an FPE in `tf.image.generate_bounding_box_proposals` on GPU ([CVE-2022-41888](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41888))
+* Fixes a segfault in `pywrap_tfe_src` caused by invalid attributes ([CVE-2022-41889](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41889))
+* Fixes a `CHECK` fail in `BCast` ([CVE-2022-41890](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41890))
+* Fixes a segfault in `TensorListConcat` ([CVE-2022-41891](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41891))
+* Fixes a `CHECK_EQ` fail in `TensorListResize` ([CVE-2022-41893](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41893))
+* Fixes an overflow in `CONV_3D_TRANSPOSE` on TFLite ([CVE-2022-41894](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41894))
+* Fixes a heap OOB in `MirrorPadGrad` ([CVE-2022-41895](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41895))
+* Fixes a crash in `Mfcc` ([CVE-2022-41896](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41896))
+* Fixes a heap OOB in `FractionalMaxPoolGrad` ([CVE-2022-41897](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41897))
+* Fixes a `CHECK` fail in `SparseFillEmptyRowsGrad` ([CVE-2022-41898](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41898))
+* Fixes a `CHECK` fail in `SdcaOptimizer` ([CVE-2022-41899](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41899))
+* Fixes a heap OOB in `FractionalAvgPool` and `FractionalMaxPool`([CVE-2022-41900](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41900))
+* Fixes a `CHECK_EQ` in `SparseMatrixNNZ` ([CVE-2022-41901](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41901))
+* Fixes an OOB write in grappler ([CVE-2022-41902](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41902))
+* Fixes an overflow in `ResizeNearestNeighborGrad` ([CVE-2022-41907](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41907))
+* Fixes a `CHECK` fail in `PyFunc` ([CVE-2022-41908](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41908))
+* Fixes a segfault in `CompositeTensorVariantToComponents` ([CVE-2022-41909](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41909))
+* Fixes an invalid char to bool conversion in printing a tensor ([CVE-2022-41911](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41911))
+* Fixes a heap overflow in `QuantizeAndDequantizeV2` ([CVE-2022-41910](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41910))
+* Fixes a `CHECK` failure in `SobolSample` via missing validation ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+* Fixes a `CHECK` fail in `TensorListScatter` and `TensorListScatterV2` in eager mode ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+
+# Release 2.9.3
+
+This release introduces several vulnerability fixes:
+
+* Fixes an overflow in `tf.keras.losses.poisson` ([CVE-2022-41887](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41887))
+* Fixes a heap OOB failure in `ThreadUnsafeUnigramCandidateSampler` caused by missing validation ([CVE-2022-41880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41880))
+* Fixes a segfault in `ndarray_tensor_bridge` ([CVE-2022-41884](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41884))
+* Fixes an overflow in `FusedResizeAndPadConv2D` ([CVE-2022-41885](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41885))
+* Fixes an overflow in `ImageProjectiveTransformV2` ([CVE-2022-41886](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41886))
+* Fixes an FPE in `tf.image.generate_bounding_box_proposals` on GPU ([CVE-2022-41888](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41888))
+* Fixes a segfault in `pywrap_tfe_src` caused by invalid attributes ([CVE-2022-41889](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41889))
+* Fixes a `CHECK` fail in `BCast` ([CVE-2022-41890](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41890))
+* Fixes a segfault in `TensorListConcat` ([CVE-2022-41891](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41891))
+* Fixes a `CHECK_EQ` fail in `TensorListResize` ([CVE-2022-41893](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41893))
+* Fixes an overflow in `CONV_3D_TRANSPOSE` on TFLite ([CVE-2022-41894](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41894))
+* Fixes a heap OOB in `MirrorPadGrad` ([CVE-2022-41895](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41895))
+* Fixes a crash in `Mfcc` ([CVE-2022-41896](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41896))
+* Fixes a heap OOB in `FractionalMaxPoolGrad` ([CVE-2022-41897](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41897))
+* Fixes a `CHECK` fail in `SparseFillEmptyRowsGrad` ([CVE-2022-41898](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41898))
+* Fixes a `CHECK` fail in `SdcaOptimizer` ([CVE-2022-41899](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41899))
+* Fixes a heap OOB in `FractionalAvgPool` and `FractionalMaxPool`([CVE-2022-41900](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41900))
+* Fixes a `CHECK_EQ` in `SparseMatrixNNZ` ([CVE-2022-41901](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41901))
+* Fixes an OOB write in grappler ([CVE-2022-41902](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41902))
+* Fixes an overflow in `ResizeNearestNeighborGrad` ([CVE-2022-41907](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41907))
+* Fixes a `CHECK` fail in `PyFunc` ([CVE-2022-41908](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41908))
+* Fixes a segfault in `CompositeTensorVariantToComponents` ([CVE-2022-41909](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41909))
+* Fixes an invalid char to bool conversion in printing a tensor ([CVE-2022-41911](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41911))
+* Fixes a heap overflow in `QuantizeAndDequantizeV2` ([CVE-2022-41910](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41910))
+* Fixes a `CHECK` failure in `SobolSample` via missing validation ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+* Fixes a `CHECK` fail in `TensorListScatter` and `TensorListScatterV2` in eager mode ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+
+# Release 2.8.4
+
+This release introduces several vulnerability fixes:
+
+* Fixes a heap OOB failure in `ThreadUnsafeUnigramCandidateSampler` caused by missing validation ([CVE-2022-41880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41880))
+* Fixes a segfault in `ndarray_tensor_bridge` ([CVE-2022-41884](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41884))
+* Fixes an overflow in `FusedResizeAndPadConv2D` ([CVE-2022-41885](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41885))
+* Fixes an overflow in `ImageProjectiveTransformV2` ([CVE-2022-41886](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41886))
+* Fixes an FPE in `tf.image.generate_bounding_box_proposals` on GPU ([CVE-2022-41888](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41888))
+* Fixes a segfault in `pywrap_tfe_src` caused by invalid attributes ([CVE-2022-41889](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41889))
+* Fixes a `CHECK` fail in `BCast` ([CVE-2022-41890](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41890))
+* Fixes a segfault in `TensorListConcat` ([CVE-2022-41891](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41891))
+* Fixes a `CHECK_EQ` fail in `TensorListResize` ([CVE-2022-41893](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41893))
+* Fixes an overflow in `CONV_3D_TRANSPOSE` on TFLite ([CVE-2022-41894](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41894))
+* Fixes a heap OOB in `MirrorPadGrad` ([CVE-2022-41895](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41895))
+* Fixes a crash in `Mfcc` ([CVE-2022-41896](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41896))
+* Fixes a heap OOB in `FractionalMaxPoolGrad` ([CVE-2022-41897](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41897))
+* Fixes a `CHECK` fail in `SparseFillEmptyRowsGrad` ([CVE-2022-41898](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41898))
+* Fixes a `CHECK` fail in `SdcaOptimizer` ([CVE-2022-41899](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41899))
+* Fixes a heap OOB in `FractionalAvgPool` and `FractionalMaxPool`([CVE-2022-41900](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41900))
+* Fixes a `CHECK_EQ` in `SparseMatrixNNZ` ([CVE-2022-41901](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41901))
+* Fixes an OOB write in grappler ([CVE-2022-41902](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41902))
+* Fixes an overflow in `ResizeNearestNeighborGrad` ([CVE-2022-41907](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41907))
+* Fixes a `CHECK` fail in `PyFunc` ([CVE-2022-41908](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41908))
+* Fixes a segfault in `CompositeTensorVariantToComponents` ([CVE-2022-41909](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41909))
+* Fixes an invalid char to bool conversion in printing a tensor ([CVE-2022-41911](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41911))
+* Fixes a heap overflow in `QuantizeAndDequantizeV2` ([CVE-2022-41910](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-41910))
+* Fixes a `CHECK` failure in `SobolSample` via missing validation ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
+* Fixes a `CHECK` fail in `TensorListScatter` and `TensorListScatterV2` in eager mode ([CVE-2022-35935](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-35935))
# Release 2.10.0
@@ -10654,3 +10979,5 @@ answered questions, and were part of inspiring discussions.
# Release 0.5.0
Initial release of TensorFlow.
+
+
diff --git a/SECURITY.md b/SECURITY.md
index f6d414794c0..d6d47c4e635 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -273,21 +273,11 @@ TensorFlow uses the following disclosure process:
* An advisory is prepared (but not published) which details the problem and
steps for mitigation.
* The vulnerability is fixed and potential workarounds are identified.
-* We will attempt to cherry-pick the fix to the release branches used for all
- releases of TensorFlow that are at most one year old (though sometimes we
- might not patch all of them). The cherry-picks will occur as soon as possible
- and the patch releases will come at the same time as the next quarterly
- release.
-* Whenever patch releases are finalized, we will notify discuss@tensorflow.org.
* We will publish a security advisory for all fixed vulnerabilities.
For each vulnerability, we try to ingress it as soon as possible, given the size
of the team and the number of reports. Vulnerabilities will, in general, be
-batched to be fixed at the same time as a quarterly release. An exception to
-this rule is for high impact vulnerabilities where exploitation of models used
-for inference in products (i.e., not models created just to showcase a
-vulnerability) is possible. In these cases, we will attempt to do patch releases
-within an accelerated timeline, not waiting for the next quarterly release.
+batched to be fixed at the same time as a quarterly release.
Past security advisories are listed
[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/README.md).
diff --git a/configure.py b/configure.py
index 135001ed103..6abde63a28a 100644
--- a/configure.py
+++ b/configure.py
@@ -36,7 +36,7 @@
_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
_SUPPORTED_ANDROID_NDK_VERSIONS = [
- 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+ 19, 20, 21
]
_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
@@ -619,7 +619,7 @@ def prompt_loop_or_load_from_env(environ_cp,
'Assuming to be a scripting mistake.' %
(var_name, n_ask_attempts))
- if resolve_symlinks and os.path.islink(val):
+ if resolve_symlinks:
val = os.path.realpath(val)
environ_cp[var_name] = val
return val
@@ -718,7 +718,8 @@ def valid_build_tools(version):
def get_ndk_api_level(environ_cp, android_ndk_home_path):
- """Gets the appropriate NDK API level to use for the provided Android NDK path."""
+ """Gets the appropriate NDK API level to use for the provided Android NDK path.
+ """
# First check to see if we're using a blessed version of the NDK.
properties_path = '%s/source.properties' % android_ndk_home_path
@@ -756,7 +757,7 @@ def valid_api_level(api_level):
android_ndk_api_level = prompt_loop_or_load_from_env(
environ_cp,
var_name='ANDROID_NDK_API_LEVEL',
- var_default='21', # 21 is required for ARM64 support.
+ var_default='26', # 26 is required to support AHardwareBuffer.
ask_for_var=('Please specify the (min) Android NDK API level to use. '
'[Available levels: %s]') % api_levels,
check_success=valid_api_level,
@@ -1188,6 +1189,9 @@ def main():
gcc_env = get_gcc_compiler(environ_cp)
if gcc_env is not None:
+ # Use gold linker if 'gcc' and if 'ppc64le'
+ write_to_bazelrc('build --linkopt="-fuse-ld=gold"')
+
# Get the linker version
ld_version = run_shell([gcc_env, '-Wl,-version']).split()
@@ -1215,8 +1219,6 @@ def main():
if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')):
write_action_env_to_bazelrc('ROCM_PATH', environ_cp.get('ROCM_PATH'))
- write_action_env_to_bazelrc('ROCBLAS_TENSILE_LIBPATH',
- environ_cp.get('ROCM_PATH') + '/lib/library')
if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('HIP_PLATFORM')):
write_action_env_to_bazelrc('HIP_PLATFORM', environ_cp.get('HIP_PLATFORM'))
diff --git a/fuzztest.bazelrc b/fuzztest.bazelrc
new file mode 100644
index 00000000000..360b3484ee9
--- /dev/null
+++ b/fuzztest.bazelrc
@@ -0,0 +1,47 @@
+### DO NOT EDIT. Generated file.
+#
+# To regenerate, run the following from your project's workspace:
+#
+# bazel run @com_google_fuzztest//bazel:setup_configs > fuzztest.bazelrc
+#
+# And don't forget to add the following to your project's .bazelrc:
+#
+# try-import %workspace%/fuzztest.bazelrc
+
+
+### Common options.
+#
+# Do not use directly.
+
+# Link with Address Sanitizer (ASAN).
+build:fuzztest-common --linkopt=-fsanitize=address
+
+# Standard define for "ifdef-ing" any fuzz test specific code.
+build:fuzztest-common --copt=-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+
+# In fuzz tests, we want to catch assertion violations even in optimized builds.
+build:fuzztest-common --copt=-UNDEBUG
+
+# Enable libc++ assertions.
+# See https://libcxx.llvm.org/UsingLibcxx.html#enabling-the-safe-libc-mode
+build:fuzztest-common --copt=-D_LIBCPP_ENABLE_ASSERTIONS=1
+
+
+### FuzzTest build configuration.
+#
+# Use with: --config=fuzztest
+
+build:fuzztest --config=fuzztest-common
+
+# Link statically.
+build:fuzztest --dynamic_mode=off
+
+# We rely on the following flag instead of the compiler provided
+# __has_feature(address_sanitizer) to know that we have an ASAN build even in
+# the uninstrumented runtime.
+build:fuzztest --copt=-DADDRESS_SANITIZER
+
+# We apply coverage tracking and ASAN instrumentation to everything but the
+# FuzzTest framework itself (including GoogleTest and GoogleMock).
+build:fuzztest --per_file_copt=+//,-//fuzztest:,-googletest/.*,-googlemock/.*@-fsanitize=address,-fsanitize-coverage=inline-8bit-counters,-fsanitize-coverage=trace-cmp
+
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 46879082f93..0d27a8294f5 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -8,16 +8,26 @@ load(
"//tensorflow:tensorflow.bzl",
"VERSION",
"VERSION_MAJOR",
+ "check_deps",
"if_google",
"if_oss",
+ "if_xla_available",
"tf_cc_shared_object",
"tf_custom_op_library_additional_deps_impl",
+ "tf_monitoring_python_deps",
"tf_native_cc_binary",
+ "tsl_async_value_deps",
)
load(
"//tensorflow/core/platform:build_config.bzl",
"tf_additional_binary_deps",
)
+load(
+ "//tensorflow/core/platform:build_config_root.bzl",
+ "if_static",
+ "tf_additional_plugin_deps",
+ "tf_additional_profiler_deps",
+)
load(
"//third_party/mkl:build_defs.bzl",
"if_mkl_ml",
@@ -28,6 +38,7 @@ load(
"ADDITIONAL_API_INDEXABLE_SETTINGS",
"tf_cc_shared_library",
)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
# copybara:uncomment_begin
# load("//tools/build_defs/license:license.bzl", "license")
@@ -95,12 +106,25 @@ PACKAGE_STATIC_DEPS = [
"@mkl_dnn_acl_compatible//:__subpackages__",
"@mkl_dnn_v1//:__subpackages__",
"@nccl_archive//:__subpackages__",
+ "@nvtx_archive//:__subpackages__",
"@org_sqlite//:__subpackages__",
"@platforms//:__subpackages__",
"@snappy//:__subpackages__",
"@upb//:__subpackages__",
"@zlib//:__subpackages__",
-]
+ "@dlpack//:__subpackages__",
+ "@arm_neon_2_x86_sse//:__subpackages__",
+ "@cpuinfo//:__subpackages__",
+ "@ruy//:__subpackages__",
+ "@XNNPACK//:__subpackages__",
+ "@pthreadpool//:__subpackages__",
+ "@FXdiv//:__subpackages__",
+ "@FP16//:__subpackages__",
+ "@clog//:__subpackages__",
+ "@flatbuffers//:__subpackages__",
+ "@nccl_archive//:__subpackages__",
+ "@triton//:__subpackages__",
+] + tsl_async_value_deps()
package(
# copybara:uncomment default_applicable_licenses = [":license"],
@@ -918,23 +942,21 @@ config_setting(
visibility = ["//visibility:public"],
)
-# copybara:uncomment_begin(configurable API loading)
-# bool_flag(
-# name = "enable_api_indexable",
-# build_setting_default = False,
-# )
-#
-# config_setting(
-# name = "api_indexable_flag",
-# flag_values = {":enable_api_indexable": "True"},
-# )
-#
-# selects.config_setting_group(
-# name = "api_indexable",
-# match_any = [":api_indexable_flag"] + ADDITIONAL_API_INDEXABLE_SETTINGS,
-# visibility = ["//visibility:public"],
-# )
-# copybara:uncomment_end
+bool_flag(
+ name = "enable_api_indexable",
+ build_setting_default = False,
+)
+
+config_setting(
+ name = "api_indexable_flag",
+ flag_values = {":enable_api_indexable": "True"},
+)
+
+selects.config_setting_group(
+ name = "api_indexable",
+ match_any = [":api_indexable_flag"] + ADDITIONAL_API_INDEXABLE_SETTINGS,
+ visibility = ["//visibility:public"],
+)
# DO NOT ADD ANY NEW EXCEPTIONS TO THIS LIST!
# Instead, please use public APIs or public build rules TF provides.
@@ -949,6 +971,8 @@ package_group(
"//learning/brain/tfrt/...",
"//learning/lib/ami/simple_ml/...",
"//learning/pathways/...",
+ "//learning/serving/contrib/tfrt/mlir/canonical_ops/...",
+ "//perftools/accelerators/xprof/integration_tests/...",
"//smartass/brain/configure/...",
"//tensorflow/...",
"//tensorflow_decision_forests/...",
@@ -967,12 +991,12 @@ package_group(name = "ndarray_tensor_allow_list")
# Packages that use private types symbols, until they are exported.
# TODO(b/154650521) Remove.
# If this is modified, then copy.bara.sky must also be modified.
-package_group(name = "types_whitelist")
+package_group(name = "types_allowlist")
# Packages that use StructuredTensors.
# TODO(b/159007891) Remove this package once StructuredTensor is exported.
# LINT.IfChange
-package_group(name = "structured_tensor_whitelist")
+package_group(name = "structured_tensor_allowlist")
# LINT.ThenChange(copy.bara.sky)
filegroup(
@@ -1081,28 +1105,38 @@ tf_cc_shared_library(
linkstatic = 1,
per_os_targets = True,
roots = [
- "//tensorflow/c/experimental/filesystem:filesystem_interface",
- "//tensorflow/c/experimental/stream_executor:stream_executor",
- "//tensorflow/c:env",
- "//tensorflow/c:kernels",
- "//tensorflow/c:kernels_experimental",
- "//tensorflow/c:logging",
- "//tensorflow/c:ops",
- "//tensorflow/cc/saved_model:fingerprinting_impl",
- "//tensorflow/cc/saved_model:loader_lite_impl",
- "//tensorflow/cc/saved_model:metrics_impl",
- "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl",
- "//tensorflow/core/common_runtime:core_cpu_impl",
- "//tensorflow/core:framework_internal_impl",
- "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
- "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl",
- "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
- "//tensorflow/core:lib_internal_impl",
- "//tensorflow/core/profiler:profiler_impl",
- "//tensorflow/core/util:determinism", # Must be linked and exported to libtensorflow_framework.so.
- "//tensorflow/lite/kernels/shim:tf_kernel_shim",
- "//tensorflow/compiler/xla/stream_executor:stream_executor_impl",
- ] + tf_additional_binary_deps(),
+ "//tensorflow/c/experimental/filesystem:filesystem_interface",
+ "//tensorflow/c/experimental/stream_executor:stream_executor",
+ "//tensorflow/c:env",
+ "//tensorflow/c:kernels",
+ "//tensorflow/c:kernels_experimental",
+ "//tensorflow/c:logging",
+ "//tensorflow/c:ops",
+ "//tensorflow/cc/saved_model:fingerprinting_impl",
+ "//tensorflow/cc/saved_model:loader_lite_impl",
+ "//tensorflow/cc/saved_model:metrics_impl",
+ "//tensorflow/compiler/tf2tensorrt:op_converter_registry_impl",
+ "//tensorflow/core/common_runtime:core_cpu_impl",
+ "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
+ "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl",
+ "//tensorflow/core:framework_internal_impl",
+ "//tensorflow/core/framework:tensor",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
+ "//tensorflow/core:lib_internal_impl",
+ "//tensorflow/core/profiler:profiler_impl",
+ "//tensorflow/core/util:determinism", # Must be linked and exported to libtensorflow_framework.so.
+ "//tensorflow/lite/kernels/shim:tf_kernel_shim",
+ "//tensorflow/compiler/xla/stream_executor:stream_executor_impl",
+ "//tensorflow/tsl/framework:bfc_allocator",
+ "//tensorflow/tsl/framework:metrics",
+ ] + tf_additional_binary_deps() +
+ # TODO(b/259305727): Remove this select and include captured_function in macos builds.
+ select({
+ "//tensorflow:macos": [],
+ "//conditions:default": [
+ "//tensorflow/core/data:captured_function",
+ ],
+ }),
soversion = VERSION,
static_deps = PACKAGE_STATIC_DEPS,
visibility = ["//visibility:public"],
@@ -1193,6 +1227,9 @@ tf_cc_shared_library(
"//tensorflow:macos": ["//tensorflow:libtensorflow_framework.%s.dylib" % VERSION],
"//conditions:default": ["//tensorflow:libtensorflow_framework.so.%s" % VERSION],
}),
+ exports_filter = [
+ "//:__subpackages__",
+ ],
framework_so = [],
linkopts = select({
"//tensorflow:macos": [
@@ -1206,41 +1243,168 @@ tf_cc_shared_library(
}),
per_os_targets = True,
roots = [
+ "//tensorflow/c:c_api",
+ "//tensorflow/c/eager:c_api",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:client_session",
- "//tensorflow/cc:const_op",
"//tensorflow/cc:scope",
- ],
+ "//tensorflow/core:tensorflow",
+ "//tensorflow/core/data:standalone",
+ # Exports for pywrap_tensorflow_internal. Many of these are transitive
+      # dependencies of the above, but must be explicitly listed for
+ # cc_shared_library to work.
+ "//tensorflow/c/eager:c_api_experimental",
+ "//tensorflow/c/eager:c_api_internal",
+ "//tensorflow/c/eager:dlpack",
+ "//tensorflow/c/eager:tape",
+ "//tensorflow/c/eager:tfe_context_internal",
+ "//tensorflow/c/eager:tfe_op_internal",
+ "//tensorflow/c/eager:tfe_tensorhandle_internal",
+ "//tensorflow/c/experimental/gradients",
+ "//tensorflow/c/experimental/gradients/tape",
+ "//tensorflow/c/experimental/ops",
+ "//tensorflow/c:c_api_experimental",
+ "//tensorflow/c:c_api_internal",
+ "//tensorflow/c:c_api_no_xla",
+ "//tensorflow/c:checkpoint_reader",
+ "//tensorflow/c:tensor_interface",
+ "//tensorflow/c:tf_status_helper",
+ "//tensorflow/c:tf_tensor_internal",
+ "//tensorflow/cc/saved_model:loader",
+ "//tensorflow/compiler/mlir/lite/metrics:error_collector",
+ "//tensorflow/compiler/mlir/lite/python:flatbuffer_to_mlir",
+ "//tensorflow/compiler/mlir/lite/python:graphdef_to_tfl_flatbuffer",
+ "//tensorflow/compiler/mlir/lite/python:jax_to_tfl_flatbuffer",
+ "//tensorflow/compiler/mlir/lite/python:saved_model_to_tfl_flatbuffer",
+ "//tensorflow/compiler/mlir/lite/quantization/lite:quantize_model",
+ "//tensorflow/compiler/mlir/lite/quantization:quantization_config",
+ "//tensorflow/compiler/mlir/lite/sparsity:sparsify_model",
+ "//tensorflow/compiler/mlir/python:mlir",
+ "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:custom_aggregator_op",
+ "//tensorflow/compiler/mlir/quantization/tensorflow/python:quantize_model_cc_impl",
+ "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
+ "//tensorflow/compiler/mlir/tensorflow",
+ "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration",
+ "//tensorflow/compiler/mlir/tensorflow:error_util",
+ "//tensorflow/compiler/mlir/tensorflow:export_graphdef",
+ "//tensorflow/compiler/mlir/tensorflow:mlir_import_options",
+ "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
+ "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",
+ "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
+ "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes",
+ "//tensorflow/compiler/mlir/tensorflow:translate_lib",
+ "//tensorflow/compiler/xla/service:computation_placer",
+ "//tensorflow/core",
+ "//tensorflow/core/common_runtime/eager:context",
+ "//tensorflow/core/common_runtime/eager:tensor_handle",
+ "//tensorflow/core/config:flag_defs",
+ "//tensorflow/core/config:flags",
+ "//tensorflow/core/data/service:dispatcher_client",
+ "//tensorflow/core/data/service:grpc_util",
+ "//tensorflow/core/data/service:py_utils",
+ "//tensorflow/core/data/service:server_lib",
+ "//tensorflow/core/debug",
+ "//tensorflow/core/distributed_runtime:server_lib",
+ "//tensorflow/core/function/runtime_client:runtime_client_cc",
+ "//tensorflow/core/grappler/clusters:cluster",
+ "//tensorflow/core/grappler/clusters:single_machine",
+ "//tensorflow/core/grappler/clusters:virtual_cluster",
+ "//tensorflow/core/grappler/costs:graph_memory",
+ "//tensorflow/core/grappler/graph_analyzer:graph_analyzer_tool",
+ "//tensorflow/core/grappler/optimizers:meta_optimizer",
+ "//tensorflow/core/grappler:grappler_item",
+ "//tensorflow/core/grappler:grappler_item_builder",
+ "//tensorflow/core/kernels:data_service_ops",
+ "//tensorflow/core/kernels:dataset_ops",
+ "//tensorflow/core/platform:logging",
+ "//tensorflow/core/platform:path",
+ "//tensorflow/core/platform:stacktrace_handler",
+ "//tensorflow/core/platform:statusor",
+ "//tensorflow/core/platform:stringpiece",
+ "//tensorflow/core/platform:types",
+ "//tensorflow/core/profiler/internal:print_model_analysis",
+ "//tensorflow/core/profiler/lib:traceme",
+ "//tensorflow/core/profiler/rpc/client:profiler_client_impl",
+ "//tensorflow/core/profiler/rpc:profiler_server_impl",
+ "//tensorflow/core/util:managed_stack_trace",
+ "//tensorflow/core:all_kernels",
+ "//tensorflow/core:core_cpu",
+ "//tensorflow/core:direct_session",
+ "//tensorflow/core:framework_internal",
+ "//tensorflow/core:graph",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:lib_internal",
+ "//tensorflow/core:ops",
+ "//tensorflow/core:reader_base",
+ "//tensorflow/core:script_ops_op_lib",
+ "//tensorflow/distribute/experimental/rpc/kernels:rpc_ops",
+ "//tensorflow/dtensor/cc:dtensor_device_cc",
+ "//tensorflow/dtensor/cc:tensor_layout",
+ "//tensorflow/lite/c:common",
+ "//tensorflow/lite/core/api",
+ "//tensorflow/lite/delegates/flex:delegate",
+ "//tensorflow/lite/kernels/internal:compatibility",
+ "//tensorflow/lite/kernels:builtin_ops",
+ "//tensorflow/lite/kernels:reference_ops",
+ "//tensorflow/lite/schema:schema_fbs",
+ "//tensorflow/lite/toco/logging:conversion_log_util",
+ "//tensorflow/lite/toco/logging:toco_conversion_log_proto_cc",
+ "//tensorflow/lite/toco:model_flags_proto_cc",
+ "//tensorflow/lite/toco:toco_convert",
+ "//tensorflow/lite/toco:toco_flags_proto_cc",
+ "//tensorflow/lite/toco:toco_graphviz_dump_options",
+ "//tensorflow/lite/toco:toco_port",
+ "//tensorflow/lite/toco:toco_tooling",
+ "//tensorflow/lite/toco:tooling_util",
+ "//tensorflow/lite/toco:types_proto_cc",
+ "//tensorflow/lite:framework",
+ "//tensorflow/lite:shared_library",
+ "//tensorflow/lite:stateful_error_reporter",
+ "//tensorflow/lite:string_util",
+ "//tensorflow/lite:util",
+ "//tensorflow/python/grappler:cost_analyzer_lib",
+ "//tensorflow/tools/graph_transforms:transform_graph_lib",
+ ] + (tf_monitoring_python_deps() +
+ tf_additional_plugin_deps() +
+ tf_additional_profiler_deps()) + if_xla_available([
+ "//tensorflow/compiler/aot:tfcompile_lib",
+ ]) + if_static(extra_deps = [
+ "//tensorflow/core/platform:tensor_float_32_utils",
+ "//tensorflow/core/platform:enable_tf2_utils",
+ ]) + if_oss([
+ "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+ "//tensorflow/core/distributed_runtime/rpc:grpc_session",
+ ]),
soversion = VERSION,
static_deps = PACKAGE_STATIC_DEPS,
visibility = ["//visibility:public"],
win_def_file = ":tensorflow_filtered_def_file",
- deps = [
- "//tensorflow/c:c_api",
- "//tensorflow/c:env",
- "//tensorflow/c:kernels",
- "//tensorflow/c:kernels_experimental",
- "//tensorflow/c:logging",
- "//tensorflow/c:ops",
- "//tensorflow/c/eager:c_api",
- "//tensorflow/c/experimental/filesystem:filesystem_interface",
- "//tensorflow/c/experimental/stream_executor:stream_executor",
- "//tensorflow/cc/saved_model:fingerprinting_impl",
- "//tensorflow/cc/saved_model:loader_lite_impl",
- "//tensorflow/cc/saved_model:metrics_impl",
- "//tensorflow/core:framework_internal_impl",
- "//tensorflow/core:lib_internal_impl",
- "//tensorflow/core:tensorflow",
- "//tensorflow/core/data:standalone",
- "//tensorflow/core/common_runtime:core_cpu_impl",
- "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
- "//tensorflow/core/common_runtime/pluggable_device:pluggable_device_runtime_impl",
- "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
- "//tensorflow/core/profiler:profiler_impl",
- "//tensorflow/core/util:determinism",
- "//tensorflow/lite/kernels/shim:tf_kernel_shim",
- "//tensorflow/compiler/xla/stream_executor:stream_executor_impl",
- ] + tf_additional_binary_deps(),
+)
+
+# To avoid duplication, check that the C++ or python library does not depend on
+# the stream executor cuda plugins. Targets that want to use cuda APIs should
+# instead depend on the dummy plugins in //tensorflow/tsl/platform/default/build_config
+# and use header only targets.
+# TODO(ddunleavy): This seems completely broken. :tensorflow_cc depends on
+# cuda_platform from tf_additional_binary_deps and this doesn't break.
+check_deps(
+ name = "cuda_plugins_check_deps",
+ disallowed_deps = if_static(
+ [],
+ otherwise = [
+ "//tensorflow/compiler/xla/stream_executor/cuda:all_runtime",
+ "//tensorflow/compiler/xla/stream_executor/cuda:cuda_driver",
+ "//tensorflow/compiler/xla/stream_executor/cuda:cuda_platform",
+ "//tensorflow/compiler/xla/stream_executor/cuda:cudnn_plugin",
+ "//tensorflow/compiler/xla/stream_executor/cuda:cufft_plugin",
+ "//tensorflow/compiler/xla/stream_executor/cuda:curand_plugin",
+ "//tensorflow/compiler/xla/stream_executor:cuda_platform",
+ ],
+ ),
+ deps = if_cuda([
+ "//tensorflow:tensorflow_cc",
+ "//tensorflow/python:pywrap_tensorflow_internal",
+ ]),
)
# ** Targets for Windows build (start) **
@@ -1344,7 +1508,7 @@ genrule(
"//tensorflow/c/eager:headers",
"//tensorflow/cc:headers",
"//tensorflow/core:headers",
- "//tensorflow/stream_executor:stream_executor_install_hdrs",
+ "//tensorflow/compiler/xla/stream_executor:stream_executor_install_hdrs",
],
outs = ["include"],
cmd = """
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 3bb0bb91ba6..cd3cbac7a96 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -103,8 +103,6 @@
# Load all plugin libraries from site-packages/tensorflow-plugins if we are
# running under pip.
-# TODO(gunan): Enable setting an environment variable to define arbitrary plugin
-# directories.
# TODO(gunan): Find a better location for this code snippet.
from tensorflow.python.framework import load_library as _ll
from tensorflow.python.lib.io import file_io as _fi
@@ -146,6 +144,11 @@ def _running_from_pip_package():
# Load Pluggable Device Library
_ll.load_pluggable_device_library(_plugin_dir)
+if _os.getenv("TF_PLUGGABLE_DEVICE_LIBRARY_PATH", ""):
+ _ll.load_pluggable_device_library(
+ _os.getenv("TF_PLUGGABLE_DEVICE_LIBRARY_PATH")
+ )
+
# Add module aliases
if hasattr(_current_module, 'keras'):
# It is possible that keras is a lazily loaded module, which might break when
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index f11fedce109..6c42fea562f 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -145,8 +145,6 @@
# Load all plugin libraries from site-packages/tensorflow-plugins if we are
# running under pip.
-# TODO(gunan): Enable setting an environment variable to define arbitrary plugin
-# directories.
# TODO(gunan): Find a better location for this code snippet.
from tensorflow.python.framework import load_library as _ll
from tensorflow.python.lib.io import file_io as _fi
@@ -187,6 +185,11 @@ def _running_from_pip_package():
# Load Pluggable Device Library
_ll.load_pluggable_device_library(_plugin_dir)
+if _os.getenv("TF_PLUGGABLE_DEVICE_LIBRARY_PATH", ""):
+ _ll.load_pluggable_device_library(
+ _os.getenv("TF_PLUGGABLE_DEVICE_LIBRARY_PATH")
+ )
+
# Explicitly import lazy-loaded modules to support autocompletion.
# pylint: disable=g-import-not-at-top
if _typing.TYPE_CHECKING:
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index dbd90e1d01f..3c1568b7091 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -2,7 +2,7 @@
# C API for TensorFlow, for use by client language bindings.
load("@bazel_skylib//lib:selects.bzl", "selects")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+load("//tensorflow/tsl/platform:rules_cc.bzl", "cc_library")
load(
"//tensorflow:tensorflow.bzl",
"check_deps",
@@ -18,6 +18,7 @@ load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_cuda_cc_test")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -39,6 +40,7 @@ filegroup(
"tf_tensor.h",
"tf_tstring.h",
"//tensorflow/core/platform:ctstring",
+ "//tensorflow/tsl/c:headers",
] + if_tensorrt([
"//tensorflow/compiler/tf2tensorrt:headers",
]),
@@ -60,7 +62,8 @@ filegroup(
"*test*",
],
) + [
- "//tensorflow/core/platform:ctstring",
+ "//tensorflow/tsl/c:srcs",
+ "//tensorflow/tsl/platform:ctstring",
"//tensorflow/cc:srcs_no_runtime",
"//tensorflow/core/distributed_runtime:server_lib.h",
],
@@ -79,6 +82,7 @@ cc_library(
"tf_buffer_internal.h",
"tf_status_internal.h",
"tf_tensor_internal.h",
+ "//tensorflow/tsl/c:tsl_status_internal_headers",
],
visibility = [
"//tensorflow/core:__pkg__",
@@ -86,6 +90,22 @@ cc_library(
],
)
+cc_library(
+ name = "c_api_headers",
+ hdrs = [
+ "c_api.h",
+ "c_api_macros.h",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":tf_attrtype",
+ ":tf_buffer",
+ ":tf_datatype",
+ ":tf_status_headers",
+ ":tf_tstring",
+ ],
+)
+
tf_cuda_library(
name = "c_api_internal",
hdrs = [
@@ -184,6 +204,7 @@ tf_cuda_library(
":tf_tensor_internal",
":tf_tstring",
"//tensorflow/core/platform:tstring",
+ "//tensorflow/tsl/c:tsl_status",
] + select({
"//tensorflow:with_xla_support": [
"//tensorflow/compiler/tf2xla:xla_compiler",
@@ -213,7 +234,7 @@ tf_cuda_library(
],
copts = tf_copts(),
visibility = [
- "//tensorflow/c:__subpackages__",
+ "//tensorflow:__subpackages__",
"//tensorflow/python:__subpackages__",
],
deps = [
@@ -273,6 +294,7 @@ tf_cuda_library(
hdrs = [
"tf_status.h",
"tf_status_internal.h",
+ "//tensorflow/tsl/c:tsl_status_internal_headers",
],
visibility = [
"//tensorflow/c:__subpackages__",
@@ -285,7 +307,11 @@ tf_cuda_library(
"//tensorflow/compiler/mlir/tensorflow/c:__subpackages__",
"//tensorflow/core/transforms:__subpackages__",
],
- deps = select({
+ deps = [
+ "//tensorflow/tsl/platform:status",
+ "//tensorflow/tsl/c:tsl_status",
+ "//tensorflow/tsl/c:tsl_status_internal",
+ ] + select({
"//tensorflow:android": [
"//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs
],
@@ -297,7 +323,10 @@ tf_cuda_library(
filegroup(
name = "tf_status_internal_headers",
- srcs = ["tf_status_internal.h"],
+ srcs = [
+ "tf_status_internal.h",
+ "//tensorflow/tsl/c:tsl_status_internal_headers",
+ ],
visibility = [
"//tensorflow/python:__subpackages__",
],
@@ -331,9 +360,11 @@ cc_library(
name = "tf_status",
srcs = ["tf_status.cc"],
hdrs = ["tf_status.h"],
+ copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
":tf_status_internal",
+ "//tensorflow/tsl/c:tsl_status",
] + select({
"//tensorflow:android": [
"//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs
@@ -344,22 +375,13 @@ cc_library(
}),
)
-tf_cc_test(
- name = "tf_status_test",
- srcs = ["tf_status_test.cc"],
- deps = [
- ":tf_status",
- ":tf_status_internal",
- "//tensorflow/core:lib",
- "//tensorflow/core:test",
- "//tensorflow/core:test_main",
- ],
-)
-
cc_library(
name = "tf_status_headers",
hdrs = ["tf_status.h"],
visibility = ["//visibility:public"],
+ deps = [
+ "//tensorflow/tsl/c:tsl_status",
+ ],
)
cc_library(
@@ -374,10 +396,12 @@ cc_library(
"tf_tensor.h",
"tf_tstring.h",
],
+ copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core/platform:status",
"//tensorflow/core/platform:tstring",
+ "//tensorflow/tsl/c:tsl_status",
],
)
@@ -406,6 +430,7 @@ cc_library(
name = "tf_datatype",
srcs = ["tf_datatype.cc"],
hdrs = ["tf_datatype.h"],
+ copts = tf_copts(),
visibility = ["//visibility:public"],
deps = select({
"//tensorflow:android": [
@@ -422,6 +447,7 @@ cc_library(
name = "tf_tensor",
srcs = ["tf_tensor.cc"],
hdrs = ["tf_tensor.h"],
+ copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
":c_api_macros",
@@ -475,6 +501,7 @@ cc_library(
hdrs = [
"tf_buffer.h",
],
+ copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
":tf_buffer_internal",
@@ -570,24 +597,9 @@ tf_cuda_library(
deps = [
":tf_status",
":tf_status_internal",
- ] + select({
- "//tensorflow:android": [
- "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs
- ],
- "//conditions:default": [
- "//tensorflow/core:lib",
- ],
- }),
-)
-
-tf_cc_test(
- name = "tf_status_helper_test",
- srcs = ["tf_status_helper_test.cc"],
- deps = [
- ":tf_status_helper",
- "//tensorflow/core:lib",
- "//tensorflow/core:test",
- "//tensorflow/core:test_main",
+ "//tensorflow/core/platform:errors",
+ "//tensorflow/core/platform:status",
+ "//tensorflow/tsl/c:tsl_status_helper",
],
)
@@ -804,7 +816,6 @@ tf_cuda_cc_test(
],
# We must ensure that the dependencies can be dynamically linked since
# the shared library must be able to use core:framework.
- # linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":c_api",
":c_api_internal",
@@ -849,6 +860,7 @@ tf_cc_test(
data = [
"testdata/tf_record",
"//tensorflow/c/experimental/stream_executor/test:test_pluggable_device.so",
+ "//tensorflow/core/common_runtime/next_pluggable_device/c:test_next_pluggable_device_plugin.so",
],
extra_copts = if_google(["-DTENSORFLOW_NO_SHARED_OBJECTS=1"]),
linkopts = select({
@@ -861,7 +873,6 @@ tf_cc_test(
],
# We must ensure that the dependencies can be dynamically linked since
# the shared library must be able to use core:framework.
- # linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":c_api",
":c_api_experimental",
@@ -934,7 +945,6 @@ tf_cuda_cc_test(
tags = ["noasan"],
# We must ensure that the dependencies can be dynamically linked since
# the shared library must be able to use core:framework.
- # linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":c_api",
":env",
@@ -955,7 +965,6 @@ tf_cuda_cc_test(
tags = ["no_cuda_on_cpu_tap"],
# We must ensure that the dependencies can be dynamically linked since
# the shared library must be able to use core:framework.
- # linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":c_api",
":kernels",
@@ -982,7 +991,6 @@ tf_cc_test(
tags = ["noasan"],
# We must ensure that the dependencies can be dynamically linked since
# the shared library must be able to use core:framework.
- # linkstatic = tf_kernel_tests_linkstatic(),
deps = [
":c_api",
":ops",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 96e1268f62d..da62fc35bc0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -117,7 +117,13 @@ const char* TF_Version() { return TF_VERSION_STRING; }
// --------------------------------------------------------------------------
// --------------------------------------------------------------------------
-TF_SessionOptions* TF_NewSessionOptions() { return new TF_SessionOptions; }
+TF_SessionOptions* TF_NewSessionOptions() {
+ TF_SessionOptions* out = new TF_SessionOptions;
+ // Disable optimizations for static graph to allow calls to Session::Extend.
+ out->options.config.mutable_experimental()
+ ->set_disable_optimize_for_static_graph(true);
+ return out;
+}
void TF_DeleteSessionOptions(TF_SessionOptions* opt) { delete opt; }
void TF_SetTarget(TF_SessionOptions* options, const char* target) {
@@ -129,6 +135,9 @@ void TF_SetConfig(TF_SessionOptions* options, const void* proto,
if (!options->options.config.ParseFromArray(proto, proto_len)) {
status->status = InvalidArgument("Unparseable ConfigProto");
}
+ // Disable optimizations for static graph to allow calls to Session::Extend.
+ options->options.config.mutable_experimental()
+ ->set_disable_optimize_for_static_graph(true);
}
void TF_TensorFromProto(const TF_Buffer* from, TF_Tensor* to,
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 523f5c6e609..3a05e1e64db 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -758,3 +758,9 @@ TF_Library* TF_LoadPluggableDeviceLibrary(const char* library_filename,
void TF_DeletePluggableDeviceLibraryHandle(TF_Library* lib_handle) {
delete lib_handle;
}
+
+void TF_GraphRemoveFunction(TF_Graph* g, const char* func_name,
+ TF_Status* status) {
+ tensorflow::mutex_lock l(g->mu);
+ status->status = g->graph.mutable_flib_def()->RemoveFunction(func_name);
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index ac41bb5a9ca..aec1e875eaf 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -329,6 +329,12 @@ TF_CAPI_EXPORT extern TF_Library* TF_LoadPluggableDeviceLibrary(
TF_CAPI_EXPORT extern void TF_DeletePluggableDeviceLibraryHandle(
TF_Library* lib_handle);
+// Removes `func_name` from `g`. If `func_name` is not in `g`, an error will be
+// returned.
+TF_CAPI_EXPORT extern void TF_GraphRemoveFunction(TF_Graph* g,
+ const char* func_name,
+ TF_Status* status);
+
#ifdef __cplusplus
} /* end extern "C" */
#endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index e47b7d0b0f7..63013c3fe46 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/c/c_api_experimental.h"
#include "absl/types/optional.h"
+#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/c/c_test_util.h"
#include "tensorflow/c/eager/c_api.h"
@@ -255,5 +256,110 @@ TEST(CAPI_EXPERIMENTAL, LibraryPluggableDeviceLoadFunctions) {
#endif // !defined(PLATFORM_WINDOWS)
}
+TEST(CAPI_EXPERIMENTAL, LibraryNextPluggableDeviceLoadFunctions) {
+ // TODO(penpornk): Enable this test on Windows.
+#if !defined(PLATFORM_WINDOWS)
+#if !defined(TENSORFLOW_NO_SHARED_OBJECTS)
+ // Load the library.
+ TF_Status* status = TF_NewStatus();
+ string lib_path =
+ tensorflow::GetDataDependencyFilepath(tensorflow::io::JoinPath(
+ "tensorflow", "core", "common_runtime", "next_pluggable_device", "c",
+ "test_next_pluggable_device_plugin.so"));
+ TF_Library* lib = TF_LoadPluggableDeviceLibrary(lib_path.c_str(), status);
+ TF_Code code = TF_GetCode(status);
+ string status_msg(TF_Message(status));
+ TF_DeleteStatus(status);
+ ASSERT_EQ(TF_OK, code) << status_msg;
+ TF_DeletePluggableDeviceLibraryHandle(lib);
+#endif // !defined(TENSORFLOW_NO_SHARED_OBJECTS)
+#endif // !defined(PLATFORM_WINDOWS)
+}
+
+void DefineFunction(const char* name, TF_Function** func,
+ const char* description = nullptr,
+ bool append_hash = false) {
+ std::unique_ptr func_graph(
+ TF_NewGraph(), TF_DeleteGraph);
+ std::unique_ptr s(TF_NewStatus(),
+ TF_DeleteStatus);
+
+ TF_Operation* feed = Placeholder(func_graph.get(), s.get());
+ TF_Operation* neg = Neg(feed, func_graph.get(), s.get());
+
+ TF_Output inputs[] = {{feed, 0}};
+ TF_Output outputs[] = {{neg, 0}};
+ *func = TF_GraphToFunction(func_graph.get(), name, append_hash, -1,
+ /*opers=*/nullptr, 1, inputs, 1, outputs,
+ /*output_names=*/nullptr,
+ /*opts=*/nullptr, description, s.get());
+ ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+ ASSERT_NE(*func, nullptr);
+}
+
+class CApiExperimentalFunctionTest : public ::testing::Test {
+ protected:
+ CApiExperimentalFunctionTest()
+ : s_(TF_NewStatus()), func_graph_(TF_NewGraph()), func_(nullptr) {}
+
+ void SetUp() override {}
+
+ ~CApiExperimentalFunctionTest() override {
+ TF_DeleteFunction(func_);
+ TF_DeleteGraph(func_graph_);
+ TF_DeleteStatus(s_);
+ }
+
+ const char* func_name_ = "MyFunc";
+ TF_Status* s_;
+ TF_Graph* func_graph_;
+ TF_Function* func_;
+};
+
+TEST_F(CApiExperimentalFunctionTest, GraphRemoveFunction) {
+ TF_Function* funcs[1];
+ DefineFunction(func_name_, &func_);
+
+ TF_GraphCopyFunction(func_graph_, func_, nullptr, s_);
+ ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+ EXPECT_EQ(TF_GraphNumFunctions(func_graph_), 1);
+ EXPECT_EQ(TF_GraphGetFunctions(func_graph_, funcs, 1, s_), 1);
+ ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+ TF_GraphRemoveFunction(func_graph_, func_name_, s_);
+ ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+ EXPECT_EQ(TF_GraphNumFunctions(func_graph_), 0);
+ EXPECT_EQ(TF_GraphGetFunctions(func_graph_, funcs, 1, s_), 0);
+
+ TF_DeleteFunction(funcs[0]);
+}
+
+TEST_F(CApiExperimentalFunctionTest, EmptyGraphRemoveNonExistentFunction) {
+ TF_GraphRemoveFunction(func_graph_, "wrong_name", s_);
+ EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
+ EXPECT_EQ(string("Tried to remove non-existent function 'wrong_name'."),
+ string(TF_Message(s_)));
+}
+
+TEST_F(CApiExperimentalFunctionTest, GraphRemoveNonExistentFunction) {
+ TF_Function* funcs[1];
+ DefineFunction(func_name_, &func_);
+
+ TF_GraphCopyFunction(func_graph_, func_, nullptr, s_);
+ ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+ EXPECT_EQ(TF_GraphNumFunctions(func_graph_), 1);
+ EXPECT_EQ(TF_GraphGetFunctions(func_graph_, funcs, 1, s_), 1);
+ ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+ TF_GraphRemoveFunction(func_graph_, "wrong_name", s_);
+ EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
+ EXPECT_EQ(string("Tried to remove non-existent function 'wrong_name'."),
+ string(TF_Message(s_)));
+ TF_DeleteFunction(funcs[0]);
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 537b61f3558..a13a1458553 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -185,7 +185,7 @@ TF_Function* TF_GraphToFunctionWithControlOutputs(
if (control_output_names) {
control_output_names_vec.reserve(ncontrol_outputs);
for (int i = 0; i < ncontrol_outputs; ++i) {
- control_output_names_vec.push_back(string(output_names[i]));
+ control_output_names_vec.push_back(string(control_output_names[i]));
}
}
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 9722841691f..79d2841b724 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -211,6 +211,14 @@ bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
std::string getTF_OutputDebugString(TF_Output node);
+// Set whether to propagate assigned device information when constructing a new
+// Graph from a GraphDef. By default assigned device information is not copied
+// and is re-computed by the runtime.
+inline void TF_ImportGraphDefOptionsSetPropagateDeviceSpec(
+ TF_ImportGraphDefOptions* opts, unsigned char propagate_device_spec) {
+ opts->opts.propagate_device_spec = propagate_device_spec;
+}
+
} // end namespace tensorflow
#endif // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index c1aeb831bce..43dfe5155de 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -9,16 +9,13 @@ load(
"tf_cuda_library",
)
load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "internal_tfrt_deps")
-load(
- "//tensorflow/core/platform:build_config.bzl",
- "tf_kernel_tests_linkstatic",
-)
load(
"//tensorflow/core/platform:build_config_root.bzl",
"tf_cuda_tests_tags",
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -131,7 +128,7 @@ filegroup(
"tfe_tensorhandle_internal.h",
],
visibility = [
- "//tensorflow/core/function:__pkg__",
+ "//tensorflow/core/function/runtime_client:__pkg__",
"//tensorflow/python:__subpackages__",
],
)
@@ -256,7 +253,6 @@ tf_cuda_cc_test(
"gradients_test.cc",
],
args = ["--heap_check="],
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["nomac"],
deps = [
":abstract_context",
@@ -293,7 +289,6 @@ tf_cuda_cc_test(
"unified_api_test.cc",
],
args = ["--heap_check="],
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["no_cuda_asan"], # b/173654156
deps = [
":c_api_experimental",
@@ -337,7 +332,6 @@ tf_cuda_cc_test(
"gradient_checker_test.cc",
],
args = ["--heap_check="],
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + [
"no_cuda_asan", # b/175330074
],
@@ -755,7 +749,10 @@ tf_cuda_cc_test(
tags = [
"no_oss", # TODO(b/200848572)
"no_windows",
+ # TODO(b/136478427): sanitizers report issues due to unclean exit.
"noasan", # leaks gRPC server instances
+ "nomsan", # b/229991646: use of destructed memory due to unclean exit.
+ "notsan", # b/259602430: race on destructed mutex due to unclean exit.
],
deps = [
":c_api",
@@ -885,9 +882,9 @@ tf_cuda_library(
}) + [
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
"@com_google_absl//absl/container:flat_hash_map",
"//tensorflow/c:tf_status_helper",
- "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
"//tensorflow/core/distributed_runtime/coordination:coordination_service_error_util",
"//tensorflow/core/distributed_runtime/eager:eager_client",
"//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
@@ -900,6 +897,7 @@ tf_cuda_library(
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/distributed_runtime:worker_env",
"//tensorflow/core:gpu_runtime",
+ "//tensorflow/tsl/distributed_runtime/coordination:coordination_service_agent",
],
alwayslink = 1,
)
@@ -911,7 +909,6 @@ tf_cuda_cc_test(
"c_api_experimental_test.cc",
],
args = ["--heap_check="],
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["nomac"],
deps = [
":c_api",
@@ -934,7 +931,6 @@ tf_cuda_cc_test(
"c_api_unified_experimental_test.cc",
],
args = ["--heap_check="],
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["nomac"],
deps = [
":c_api",
@@ -1015,7 +1011,7 @@ cc_library(
name = "dlpack",
srcs = ["dlpack.cc"],
hdrs = ["dlpack.h"],
- copts = [
+ copts = tf_copts() + [
"-fexceptions",
"-fno-strict-aliasing",
],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 13a9c797235..e3199b204f6 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -68,7 +68,6 @@ limitations under the License.
#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) && \
!defined(PLATFORM_FUCHSIA)
#include "tensorflow/core/tfrt/eager/c_api_tfrt.h"
-#include "tensorflow/core/tfrt/eager/c_api_tfrt_distributed_impl.h"
#endif // PLATFORM_GOOGLE && !LIBTPU_ON_GCE && !PLATFORM_FUCHSIA
#if !defined(IS_MOBILE_PLATFORM)
@@ -123,12 +122,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
opts->session_options.options,
static_cast(
opts->device_placement_policy),
- opts->async, opts->use_tfrt_distributed_runtime);
-#if !defined(IS_MOBILE_PLATFORM)
- tfrt_context->SetDistributedManager(
- tfrt::tf::CreateDistributedManagerContext(
- tfrt_context->GetCoreRuntime()->GetHostContext()));
-#endif // !IS_MOBILE_PLATFORM
+ opts->async);
return tensorflow::wrap(tfrt_context);
#else
status->status = tensorflow::errors::Unimplemented("TFRT is not supported");
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
index 149c6062d23..7eb22ed2c7c 100644
--- a/tensorflow/c/eager/c_api_experimental.cc
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include
#include "absl/strings/match.h"
+#include "absl/time/time.h"
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api_internal.h"
#include "tensorflow/c/eager/tfe_context_internal.h"
@@ -27,7 +28,6 @@ limitations under the License.
#include "tensorflow/core/common_runtime/composite_device.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/eager/eager_operation.h"
-#include "tensorflow/core/distributed_runtime/coordination/coordination_service_agent.h"
#include "tensorflow/core/distributed_runtime/coordination/coordination_service_error_util.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/gauge.h"
@@ -36,6 +36,7 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/tsl/distributed_runtime/coordination/coordination_service_agent.h"
using tensorflow::string;
@@ -539,11 +540,6 @@ void TFE_ContextOptionsSetTfrt(TFE_ContextOptions* options, bool use_tfrt) {
options->use_tfrt = use_tfrt;
}
-void TFE_ContextOptionsSetTfrtDistributedRuntime(
- TFE_ContextOptions* options, bool use_tfrt_distributed_runtime) {
- options->use_tfrt_distributed_runtime = use_tfrt_distributed_runtime;
-}
-
TFE_CancellationManager* TFE_NewCancellationManager() {
return tensorflow::wrap(new tensorflow::CancellationManager);
}
@@ -571,8 +567,10 @@ void TFE_OpSetCancellationManager(TFE_Op* op,
status->status = ::tensorflow::OkStatus();
}
-TFE_Executor* TFE_NewExecutor(bool is_async, bool enable_streaming_enqueue) {
- return new TFE_Executor(is_async, enable_streaming_enqueue);
+TFE_Executor* TFE_NewExecutor(bool is_async, bool enable_streaming_enqueue,
+ int in_flight_nodes_limit) {
+ return new TFE_Executor(is_async, enable_streaming_enqueue,
+ in_flight_nodes_limit);
}
void TFE_DeleteExecutor(TFE_Executor* executor) { delete executor; }
@@ -785,7 +783,7 @@ void TFE_InsertConfigKeyValue(TFE_Context* ctx, const char* key,
const char* value, TF_Status* status) {
tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
tensorflow::unwrap(ctx)->GetDistributedManager();
- tensorflow::CoordinationServiceAgent* coord_agent =
+ tsl::CoordinationServiceAgent* coord_agent =
dist_mgr->GetCoordinationServiceAgent();
if (coord_agent == nullptr) {
status->status = tensorflow::errors::FailedPrecondition(
@@ -799,7 +797,7 @@ void TFE_GetConfigKeyValue(TFE_Context* ctx, const char* key,
TF_Buffer* value_buf, TF_Status* status) {
tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
tensorflow::unwrap(ctx)->GetDistributedManager();
- tensorflow::CoordinationServiceAgent* coord_agent =
+ tsl::CoordinationServiceAgent* coord_agent =
dist_mgr->GetCoordinationServiceAgent();
if (coord_agent == nullptr) {
status->status = tensorflow::errors::FailedPrecondition(
@@ -824,7 +822,7 @@ void TFE_DeleteConfigKeyValue(TFE_Context* ctx, const char* key,
TF_Status* status) {
tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
tensorflow::unwrap(ctx)->GetDistributedManager();
- tensorflow::CoordinationServiceAgent* coord_agent =
+ tsl::CoordinationServiceAgent* coord_agent =
dist_mgr->GetCoordinationServiceAgent();
if (coord_agent == nullptr) {
status->status = tensorflow::errors::FailedPrecondition(
@@ -838,7 +836,7 @@ void TFE_ReportErrorToCluster(TFE_Context* ctx, int error_code,
const char* error_message, TF_Status* status) {
tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
tensorflow::unwrap(ctx)->GetDistributedManager();
- tensorflow::CoordinationServiceAgent* coord_agent =
+ tsl::CoordinationServiceAgent* coord_agent =
dist_mgr->GetCoordinationServiceAgent();
if (coord_agent == nullptr) {
status->status = tensorflow::errors::FailedPrecondition(
@@ -854,7 +852,7 @@ void TFE_GetTaskStates(TFE_Context* ctx, const TF_Buffer& tasks, void* states,
TF_Status* status) {
tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
tensorflow::unwrap(ctx)->GetDistributedManager();
- tensorflow::CoordinationServiceAgent* coord_agent =
+ tsl::CoordinationServiceAgent* coord_agent =
dist_mgr->GetCoordinationServiceAgent();
if (coord_agent == nullptr) {
status->status = tensorflow::errors::FailedPrecondition(
@@ -890,3 +888,18 @@ void TFE_GetTaskStates(TFE_Context* ctx, const TF_Buffer& tasks, void* states,
}
status->status = tensorflow::OkStatus();
}
+
+void TFE_WaitAtBarrier(TFE_Context* ctx, const char* barrier_id,
+ int64_t barrier_timeout_in_ms, TF_Status* status) {
+ tensorflow::ImmediateExecutionDistributedManager* dist_mgr =
+ tensorflow::unwrap(ctx)->GetDistributedManager();
+ tsl::CoordinationServiceAgent* coord_agent =
+ dist_mgr->GetCoordinationServiceAgent();
+ if (coord_agent == nullptr) {
+ status->status = tensorflow::errors::FailedPrecondition(
+ "Coordination service is not enabled.");
+ return;
+ }
+ status->status = coord_agent->WaitAtBarrier(
+ barrier_id, absl::Milliseconds(barrier_timeout_in_ms), {});
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index 704a093fbab..95d833f6f47 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -294,10 +294,6 @@ TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2(
TF_CAPI_EXPORT extern void TFE_ContextOptionsSetTfrt(TFE_ContextOptions*,
bool use_tfrt);
-// Sets whether to use TFRT distributed runtime
-TF_CAPI_EXPORT extern void TFE_ContextOptionsSetTfrtDistributedRuntime(
- TFE_ContextOptions* options, bool use_tfrt_distributed_runtime);
-
// Returns the context_id from the EagerContext which is used by the
// EagerService to maintain consistency between client and worker. The
// context_id is initialized with a dummy value and is later set when the worker
@@ -333,8 +329,16 @@ typedef struct TFE_Executor TFE_Executor;
// Creates a new eager Executor. Nodes in one executor are guaranteed to be
// executed in sequence. Assigning nodes to different executors allows executing
// nodes in parallel.
+// in_flight_nodes_limit: when is_async is true, this value controls the
+// maximum number of in flight async nodes. Enqueuing of additional async ops
+// after the limit is reached blocks until some inflight nodes finishes.
+// The effect is bounding the memory held by inflight TensorHandles that are
+// referenced by the inflight nodes.
+// A recommended value has not been established.
+// A value of 0 removes the limit, which is the behavior of TensorFlow 2.11.
+// When is_async is false, the value is ignored.
TF_CAPI_EXPORT extern TFE_Executor* TFE_NewExecutor(
- bool is_async, bool enable_streaming_enqueue);
+ bool is_async, bool enable_streaming_enqueue, int in_flight_nodes_limit);
// Deletes the eager Executor without waiting for enqueued nodes. Please call
// TFE_ExecutorWaitForAllPendingNodes before calling this API if you want to
@@ -724,6 +728,11 @@ TF_CAPI_EXPORT extern void TFE_GetTaskStates(TFE_Context* ctx,
const TF_Buffer& tasks,
void* states, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_WaitAtBarrier(TFE_Context* ctx,
+ const char* barrier_id,
+ int64_t barrier_timeout_in_ms,
+ TF_Status* status);
+
#ifdef __cplusplus
} /* end extern "C" */
#endif
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
index f900de59060..68dbafc4d2a 100644
--- a/tensorflow/c/eager/c_api_experimental_test.cc
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -220,7 +220,8 @@ TEST(CAPI, ExecutorContextDestructionOrder) {
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
TFE_DeleteContextOptions(opts);
TFE_Executor* executor = TFE_NewExecutor(
- /*is_async=*/false, /*enable_streaming_enqueue=*/true);
+ /*is_async=*/false, /*enable_streaming_enqueue=*/true,
+ /*in_flight_nodes_limit=*/0);
TFE_ContextSetExecutorForThread(ctx, executor);
TFE_DeleteContext(ctx);
@@ -233,7 +234,8 @@ TEST(CAPI, ExecutorContextDestructionOrder) {
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
TFE_DeleteContextOptions(opts);
TFE_Executor* executor = TFE_NewExecutor(
- /*is_async=*/false, /*enable_streaming_enqueue=*/true);
+ /*is_async=*/false, /*enable_streaming_enqueue=*/true,
+ /*in_flight_nodes_limit=*/0);
TFE_ContextSetExecutorForThread(ctx, executor);
TFE_DeleteExecutor(executor);
@@ -275,7 +277,8 @@ TEST(CAPI, Function_ident_CPU) {
for (bool async : {false, true, false}) {
TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx);
TFE_Executor* executor = TFE_NewExecutor(
- /*is_async=*/async, /*enable_streaming_enqueue=*/true);
+ /*is_async=*/async, /*enable_streaming_enqueue=*/true,
+ /*in_flight_nodes_limit=*/0);
TFE_ContextSetExecutorForThread(ctx, executor);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
@@ -327,7 +330,8 @@ void Executor_MatMul_CPU(bool async) {
TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx);
TFE_Executor* executor = TFE_NewExecutor(
- /*is_async=*/async, /*enable_streaming_enqueue=*/true);
+ /*is_async=*/async, /*enable_streaming_enqueue=*/true,
+ /*in_flight_nodes_limit=*/0);
TFE_ContextSetExecutorForThread(ctx, executor);
TFE_TensorHandle* m = TestMatrixTensorHandle(ctx);
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 8bec998681e..eff96826822 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -34,11 +34,6 @@ struct TFE_ContextOptions {
TFE_DEVICE_PLACEMENT_SILENT};
// If true, use TFRT backend
bool use_tfrt = false;
- // This option is effective only when use_tfrt is true. If true, TFRT will use
- // native TFRT distributed runtime. Otherwise, TFRT will use current runtime's
- // distributed runtime. Note that TFRT distributed runtime is in development
- // and not functionally complete.
- bool use_tfrt_distributed_runtime = false;
// Whether to run elementary eager ops wrapped in a call op.
bool run_eager_op_as_function = false;
// Whether to rewrite jit_compile functions.
diff --git a/tensorflow/c/eager/immediate_execution_distributed_manager.h b/tensorflow/c/eager/immediate_execution_distributed_manager.h
index 9efb2fa85d6..4f96992e739 100644
--- a/tensorflow/c/eager/immediate_execution_distributed_manager.h
+++ b/tensorflow/c/eager/immediate_execution_distributed_manager.h
@@ -20,8 +20,11 @@ limitations under the License.
#include "tensorflow/core/platform/status.h"
-namespace tensorflow {
+namespace tsl {
class CoordinationServiceAgent;
+}
+
+namespace tensorflow {
class ImmediateExecutionContext;
class ServerDef;
class WorkerEnv;
@@ -32,19 +35,19 @@ class ImmediateExecutionDistributedManager {
virtual ~ImmediateExecutionDistributedManager() {}
// Set up distributed execution environment on local and remote tasks.
- // When `reset_context` is true, initialize new cluster context state based on
- // cluster configurations provided in `server_def`; otherwise, update existing
- // context state with the provided `server_def`.
- // Contexts created on remote tasks will be considered stale and garbage
- // collected after `keep_alive_secs` of inactivity.
+ // When `reset_context` is true, initialize new cluster context state based
+ // on cluster configurations provided in `server_def`; otherwise, update
+ // existing context state with the provided `server_def`. Contexts created
+ // on remote tasks will be considered stale and garbage collected after
+ // `keep_alive_secs` of inactivity.
virtual Status SetOrUpdateServerDef(const ServerDef& server_def,
bool reset_context,
int keep_alive_secs) = 0;
- // Set up a multi-client distributed execution environment. Must be called on
- // all tasks in the cluster.
- // This call internally coordinates with other tasks to initialize the eager
- // context and TF server for multi-client execution.
+ // Set up a multi-client distributed execution environment. Must be called
+ // on all tasks in the cluster. This call internally coordinates with other
+ // tasks to initialize the eager context and TF server for multi-client
+ // execution.
virtual Status EnableCollectiveOps(const ServerDef& server_def) = 0;
// Check if the remote task is alive.
@@ -52,7 +55,7 @@ class ImmediateExecutionDistributedManager {
bool* is_alive) = 0;
// Get pointer to the coordination service agent instance.
- virtual CoordinationServiceAgent* GetCoordinationServiceAgent() = 0;
+ virtual tsl::CoordinationServiceAgent* GetCoordinationServiceAgent() = 0;
};
} // namespace tensorflow
diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD
index e528a7070ab..0de029ff449 100644
--- a/tensorflow/c/eager/parallel_device/BUILD
+++ b/tensorflow/c/eager/parallel_device/BUILD
@@ -6,6 +6,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc
index 727c1f83396..fd054c9af9a 100644
--- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc
+++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc
@@ -66,7 +66,8 @@ using ExecutorPtr = std::unique_ptr;
class DeviceThread {
public:
// Starts a background thread waiting for `StartExecute`.
- explicit DeviceThread(const std::string& device, const bool is_async)
+ explicit DeviceThread(const std::string& device, const bool is_async,
+ const int in_flight_nodes_limit)
: status_(TF_NewStatus()),
// If the context's default exector is set to async, re-using that in
// each thread would cause collectives to deadlock. For consistency we
@@ -75,7 +76,9 @@ class DeviceThread {
// TODO(allenl): We should have an async API that works with the
// parallel device.
device_(device),
- executor_(TFE_NewExecutor(is_async, /*enable_streaming_enqueue=*/true)),
+ executor_(
+ TFE_NewExecutor(is_async, /*enable_streaming_enqueue=*/true,
+ /*in_flight_nodes_limit=*/in_flight_nodes_limit)),
op_(nullptr),
thread_(tensorflow::Env::Default()->StartThread(
tensorflow::ThreadOptions(), "parallel_device_execute",
@@ -282,13 +285,13 @@ void DeviceThread::Execute(TFE_Context* context, const char* operation_name,
}
ParallelDevice::ParallelDevice(const std::vector& devices,
- const bool is_async)
+ bool is_async, int in_flight_nodes_limit)
: underlying_devices_(devices),
default_cancellation_manager_(absl::make_unique()) {
device_threads_.reserve(devices.size());
for (int device_index = 0; device_index < devices.size(); ++device_index) {
- device_threads_.emplace_back(
- new DeviceThread(devices[device_index].c_str(), is_async));
+ device_threads_.emplace_back(new DeviceThread(
+ devices[device_index].c_str(), is_async, in_flight_nodes_limit));
}
}
@@ -365,6 +368,26 @@ void ParallelDevice::StartExecute(TFE_Context* context,
}
}
+void ParallelDevice::StartExecute(
+ TFE_Context* context, const std::vector& inputs,
+ const char* operation_name, const TFE_OpAttrs* attributes,
+ int expected_max_outputs, CancellationManager& cancellation_manager,
+ absl::optional step_id) const {
+ for (int device_index = 0; device_index < underlying_devices_.size();
+ ++device_index) {
+ DeviceThread* device_thread = device_threads_[device_index].get();
+ std::vector device_inputs;
+ device_inputs.reserve(inputs.size());
+ for (int input_index = 0; input_index < inputs.size(); ++input_index) {
+ // Parallel tensors are divided between operations by device.
+ device_inputs.push_back(inputs[input_index][device_index].get());
+ }
+ device_thread->StartExecute(
+ context, operation_name, std::move(device_inputs), attributes,
+ expected_max_outputs, cancellation_manager, step_id);
+ }
+}
+
void ParallelDevice::AsyncWait(TFE_Context* context, TF_Status* status) const {
StatusPtr first_bad_status(nullptr);
diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h
index 80f81dd47a4..01581f40e05 100644
--- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h
+++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_LIB_H_
#include
+#include
#include
#include
@@ -44,6 +45,8 @@ class TensorHandleDeleter {
}
};
+// TODO(b/256016071): Replace this with `Safe_TFE_TensorHandlePtr` when
+// `Safe_TFE_TensorHandlePtr` is marked to be compatible on non-prod env.
using TensorHandlePtr = std::unique_ptr;
class ParallelTensor;
@@ -56,7 +59,7 @@ class ParallelDevice {
// Eager async execution is only supported when remote eager is not in use
// (b/157523095).
explicit ParallelDevice(const std::vector& devices,
- const bool is_async = false);
+ bool is_async = false, int in_flight_nodes_limit = 0);
~ParallelDevice();
@@ -118,12 +121,24 @@ class ParallelDevice {
//
// Set step_id to configure the step id used for rendezvous creation. step id
// of value -1 is reserved for global rendezvous and should not be set here.
+ //
+ // This function is overloaded so that if the inputs are constructed from
+ // `TensorWithLayout` we can use the one with `TensorHandlePtr` but
+ // if the inputs are directly `ParallelTensor` (for example, in the case of
+ // custom device execution) we can use the one with `ParallelTensor`.
void StartExecute(TFE_Context* context,
const std::vector& inputs,
const char* operation_name, const TFE_OpAttrs* attributes,
int expected_max_outputs,
CancellationManager& cancellation_manager,
- absl::optional step_id = absl::nullopt) const;
+ std::optional step_id = std::nullopt) const;
+
+ void StartExecute(TFE_Context* context,
+ const std::vector& inputs,
+ const char* operation_name, const TFE_OpAttrs* attributes,
+ int expected_max_outputs,
+ CancellationManager& cancellation_manager,
+ std::optional step_id = std::nullopt) const;
// Blocks until the previous `StartExecute` has run `TFE_Execute` on each
// device. If is_async=false (constructor argument) this means the ops have
@@ -189,6 +204,7 @@ class ParallelTensor {
size_t num_tensors() const { return tensors_.size(); }
TFE_TensorHandle* tensor(size_t index) const { return tensors_[index].get(); }
+ const TensorHandlePtr* tensor_data() const { return tensors_.data(); }
// If the `shape` argument to `FromTensorHandles` is specified, returns that.
//
diff --git a/tensorflow/c/eager/tfe_executor_internal.h b/tensorflow/c/eager/tfe_executor_internal.h
index 081b139bd34..7f55532af56 100644
--- a/tensorflow/c/eager/tfe_executor_internal.h
+++ b/tensorflow/c/eager/tfe_executor_internal.h
@@ -20,9 +20,10 @@ limitations under the License.
#include "tensorflow/core/common_runtime/eager/eager_executor.h"
struct TFE_Executor {
- explicit TFE_Executor(bool async, bool enable_streaming_enqueue)
- : owned_executor(
- new tensorflow::EagerExecutor(async, enable_streaming_enqueue)) {}
+ explicit TFE_Executor(bool async, bool enable_streaming_enqueue,
+ int in_flight_nodes_limit)
+ : owned_executor(new tensorflow::EagerExecutor(
+ async, enable_streaming_enqueue, in_flight_nodes_limit)) {}
explicit TFE_Executor(tensorflow::EagerExecutor* executor)
: owned_executor(nullptr), unowned_executor(executor) {}
diff --git a/tensorflow/c/experimental/filesystem/BUILD b/tensorflow/c/experimental/filesystem/BUILD
index 4d8ff231ce7..6c5c43fbb46 100644
--- a/tensorflow/c/experimental/filesystem/BUILD
+++ b/tensorflow/c/experimental/filesystem/BUILD
@@ -6,6 +6,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.bzl", "tf_cc_test")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -42,6 +43,7 @@ cc_library(
"//tensorflow/core/platform:env",
"//tensorflow/core/platform:errors",
"//tensorflow/core/platform:status",
+ "//tensorflow/tsl/platform:errors",
],
)
diff --git a/tensorflow/c/experimental/filesystem/modular_filesystem.cc b/tensorflow/c/experimental/filesystem/modular_filesystem.cc
index 32b06697d77..b47748374fe 100644
--- a/tensorflow/c/experimental/filesystem/modular_filesystem.cc
+++ b/tensorflow/c/experimental/filesystem/modular_filesystem.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/file_system_helper.h"
#include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/tsl/platform/errors.h"
// TODO(b/139060984): After all filesystems are converted, all calls to
// methods from `FileSystem` will have to be replaced to calls to private
@@ -561,8 +562,9 @@ Status RegisterFilesystemPlugin(const std::string& dso_path) {
// Step 2: Load symbol for `TF_InitPlugin`
void* dso_symbol;
- TF_RETURN_IF_ERROR(
- env->GetSymbolFromLibrary(dso_handle, "TF_InitPlugin", &dso_symbol));
+ TF_RETURN_WITH_CONTEXT_IF_ERROR(
+ env->GetSymbolFromLibrary(dso_handle, "TF_InitPlugin", &dso_symbol),
+ "Failed to load TF_InitPlugin symbol for DSO: ", dso_path);
// Step 3: Call `TF_InitPlugin`
TF_FilesystemPluginInfo info;
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
index 1d9bfc1a15f..bd2041b1d43 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
@@ -4,6 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
index 9d655fd43b5..90acb2bf389 100644
--- a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
@@ -4,6 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:private"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
index fb2f99f44ff..2ac57f6a731 100644
--- a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
@@ -4,6 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD
index 90a99b05e38..1788cbd6551 100644
--- a/tensorflow/c/experimental/gradients/BUILD
+++ b/tensorflow/c/experimental/gradients/BUILD
@@ -5,10 +5,6 @@ load(
"if_libtpu",
"tf_cuda_cc_test",
)
-load(
- "//tensorflow/core/platform:build_config.bzl",
- "tf_kernel_tests_linkstatic",
-)
load(
"//tensorflow/core/platform:build_config_root.bzl",
"tf_cuda_tests_tags",
@@ -16,6 +12,7 @@ load(
# Library of gradient functions.
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -59,7 +56,7 @@ cc_library(
"nn_grad.h",
],
visibility = [
- "//tensorflow:internal",
+ "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
],
deps = [
"//tensorflow/c/eager:abstract_tensor_handle",
@@ -118,7 +115,6 @@ tf_cuda_cc_test(
"custom_gradient_test.cc",
],
args = ["--heap_check="], # TODO(b/174752220): Remove
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags(),
deps = [
"//tensorflow/c:tf_status_helper",
@@ -144,10 +140,7 @@ filegroup(
"nn_grad.h",
"not_differentiable.h",
],
- visibility = [
- "//tensorflow/core:__pkg__",
- "//tensorflow/python:__pkg__",
- ],
+ visibility = ["//tensorflow/python:__pkg__"],
)
cc_library(
@@ -156,7 +149,7 @@ cc_library(
srcs = ["grad_test_helper.cc"],
hdrs = ["grad_test_helper.h"],
visibility = [
- "//tensorflow:internal",
+ "//visibility:private", # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
],
deps = [
"//tensorflow/c/eager:gradient_checker",
@@ -175,7 +168,6 @@ tf_cuda_cc_test(
"nn_grad_test.cc",
],
args = ["--heap_check="], # TODO(b/174752220): Remove
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["no_cuda_asan"], # b/173654156,
deps = [
":grad_test_helper",
@@ -202,7 +194,6 @@ tf_cuda_cc_test(
"math_grad_test.cc",
],
args = ["--heap_check="], # TODO(b/174752220): Remove
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["no_cuda_asan"], # b/173654156,
deps = [
":grad_test_helper",
@@ -229,7 +220,6 @@ tf_cuda_cc_test(
"array_grad_test.cc",
],
args = ["--heap_check="], # TODO(b/174752220): Remove
- linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags() + ["no_cuda_asan"], # b/173654156,
deps = [
":grad_test_helper",
diff --git a/tensorflow/c/experimental/gradients/tape/BUILD b/tensorflow/c/experimental/gradients/tape/BUILD
index 123f1908020..c29b7929d43 100644
--- a/tensorflow/c/experimental/gradients/tape/BUILD
+++ b/tensorflow/c/experimental/gradients/tape/BUILD
@@ -2,6 +2,7 @@
load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/grappler/BUILD b/tensorflow/c/experimental/grappler/BUILD
index 68bdcdcda70..482ec08efed 100644
--- a/tensorflow/c/experimental/grappler/BUILD
+++ b/tensorflow/c/experimental/grappler/BUILD
@@ -8,6 +8,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD
new file mode 100644
index 00000000000..890477266ea
--- /dev/null
+++ b/tensorflow/c/experimental/next_pluggable_device/BUILD
@@ -0,0 +1,34 @@
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+
+package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+ licenses = ["notice"],
+)
+
+cc_library(
+ name = "c_api",
+ srcs = ["c_api.cc"],
+ hdrs = ["c_api.h"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//tensorflow/c:c_api",
+ "//tensorflow/c:kernels",
+ "//tensorflow/c:kernels_experimental_hdrs",
+ "//tensorflow/c:tf_status_helper",
+ "//tensorflow/c:tf_status_internal",
+ "//tensorflow/c:tf_tensor_internal",
+ "//tensorflow/compiler/jit:xla_launch_util",
+ "//tensorflow/compiler/xla/pjrt:pjrt_c_api_client",
+ "//tensorflow/compiler/xla/pjrt:pjrt_client",
+ "//tensorflow/compiler/xla/pjrt/c:pjrt_c_api_hdrs",
+ "//tensorflow/core:framework",
+ "//tensorflow/core/common_runtime/next_pluggable_device",
+ "//tensorflow/core/common_runtime/next_pluggable_device:plugin_resource",
+ "//tensorflow/core/platform:status",
+ "//tensorflow/core/tfrt/common:async_value_tensor",
+ "//tensorflow/core/tfrt/common:pjrt_util",
+ "//tensorflow/tsl/distributed_runtime/coordination:coordination_service_agent",
+ "//tensorflow/tsl/platform:errors",
+ "//tensorflow/tsl/platform:statusor",
+ ],
+)
diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc
new file mode 100644
index 00000000000..1ff6e091507
--- /dev/null
+++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc
@@ -0,0 +1,333 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/experimental/next_pluggable_device/c_api.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "tensorflow/c/kernels_experimental.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/c/tf_status_internal.h"
+#include "tensorflow/c/tf_tensor.h"
+#include "tensorflow/c/tf_tensor_internal.h"
+#include "tensorflow/compiler/jit/xla_launch_util.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_c_api_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h"
+#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/tfrt/common/async_value_tensor.h"
+#include "tensorflow/core/tfrt/common/pjrt_util.h"
+#include "tensorflow/tsl/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tensorflow/tsl/platform/errors.h"
+#include "tensorflow/tsl/platform/statusor.h"
+
+TF_Device* TF_GetDevice(TF_OpKernelContext* ctx) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ return reinterpret_cast(cc_ctx->device());
+}
+
+size_t TF_GetDeviceOrdinal(TF_Device* device) {
+ // TODO(chuanhao): make GetDeviceOrdinal a virtual member function in the base
+ // device class, instead of casting to `NextPluggableDevice`.
+ auto cc_device = reinterpret_cast(device);
+ return cc_device->GetDeviceOrdinal();
+}
+
+// -------------------------- Resource ---------------------------------------
+void TF_CreatePluginResource(TF_OpKernelContext* ctx,
+ const char* container_name,
+ const char* plugin_resource_name,
+ void* plugin_resource, void (*delete_func)(void*),
+ TF_Status* status) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ tensorflow::PluginResource* cc_resource_ptr = new tensorflow::PluginResource(
+ plugin_resource, plugin_resource_name, delete_func);
+ auto cc_status =
+ cc_ctx->resource_manager()->Create(
+ container_name, plugin_resource_name, cc_resource_ptr);
+ Set_TF_Status_from_Status(status, cc_status);
+}
+
+void TF_LookupOrCreatePluginResource(
+ TF_OpKernelContext* ctx, const char* container_name,
+ const char* plugin_resource_name, void** result_plugin_resource,
+ void* (*create_func)(void*), void* create_func_args,
+ void (*delete_func)(void*), TF_Status* status) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ auto* resource_mgr = cc_ctx->resource_manager();
+ tensorflow::core::RefCountPtr
+ tf_plugin_resource_ptr;
+ tensorflow::PluginResource* tf_plugin_resource = nullptr;
+
+ auto cc_status = resource_mgr->LookupOrCreate(
+ container_name, plugin_resource_name, &tf_plugin_resource,
+ [plugin_resource_name, create_func, create_func_args,
+ delete_func](tensorflow::PluginResource** new_resource) {
+ void* opaque_plugin_resource = create_func(create_func_args);
+ *new_resource = new tensorflow::PluginResource(
+ opaque_plugin_resource, plugin_resource_name, delete_func);
+ return tensorflow::OkStatus();
+ });
+
+ if (cc_status.ok()) {
+ tf_plugin_resource_ptr.reset(tf_plugin_resource);
+ *result_plugin_resource = tf_plugin_resource_ptr->GetOpaquePluginResource();
+ } else {
+ *result_plugin_resource = nullptr;
+ }
+ Set_TF_Status_from_Status(status, cc_status);
+}
+
+// ------------------------- VariableInfo ------------------------------------
+struct TF_VariableInfo {
+ TF_VariableInfo() = delete;
+ // TF_VariableInfo is constructed here by TensorFlow, and will be passed to
+ // plugin as an opaque pointer. Plugin will need to call C APIs below to
+ // operate on TF_VariableInfo (such as allocate temp tensor for the `var` held
+ // by the underlying tensorflow::VariableInfo.
+ TF_VariableInfo(int index, const std::string& name, tensorflow::Var* var) {
+ var_info = tensorflow::VariableInfo{index, name, var};
+ }
+
+ tensorflow::VariableInfo var_info{0, "", nullptr};
+};
+
+TF_VariableInfo* TF_CreateVariableInfoFromContext(TF_OpKernelContext* ctx,
+ int index,
+ TF_Status* status) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ const tensorflow::Tensor& arg_tensor = cc_ctx->input(index);
+ tsl::Status cc_status;
+ if (arg_tensor.dtype() != tensorflow::DT_RESOURCE) {
+ cc_status = tsl::errors::InvalidArgument(
+ "Trying to obtain resource handle from Input[", index,
+ "], which is not type DT_RESOURCE.");
+ Set_TF_Status_from_Status(status, cc_status);
+ return nullptr;
+ }
+ const tensorflow::ResourceHandle& handle =
+ arg_tensor.flat()(0);
+ tensorflow::Var* variable;
+ cc_status = tensorflow::LookupResource(cc_ctx, handle, &variable);
+ return new TF_VariableInfo(index, handle.name(), variable);
+}
+
+void TF_LockVariableInfos(TF_VariableInfo** vars, int num_vars,
+ TF_Status* status) {
+ std::vector variable_ptrs;
+ variable_ptrs.reserve(num_vars);
+ for (int i = 0; i < num_vars; ++i) {
+ variable_ptrs.push_back(&(vars[i]->var_info));
+ }
+ tsl::Status cc_status = LockVariables(absl::MakeSpan(variable_ptrs));
+ tsl::Set_TF_Status_from_Status(status, cc_status);
+}
+
+void TF_AllocateTempForVariableInfo(TF_OpKernelContext* ctx,
+ TF_VariableInfo* var_info,
+ TF_Status* status) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ tsl::Status cc_status;
+ if (var_info == nullptr) {
+ cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL.");
+ Set_TF_Status_from_Status(status, cc_status);
+ return;
+ }
+ if (var_info->var_info.var() == nullptr) {
+ cc_status = tsl::errors::InvalidArgument(
+ "VariableInfo does not track a resource variable.");
+ Set_TF_Status_from_Status(status, cc_status);
+ return;
+ }
+
+ cc_status = cc_ctx->allocate_temp(var_info->var_info.var()->tensor()->dtype(),
+ var_info->var_info.var()->tensor()->shape(),
+ var_info->var_info.var()->tensor());
+ Set_TF_Status_from_Status(status, cc_status);
+}
+
+TF_Tensor* TF_GetTensorFromVariableInfo(TF_VariableInfo* var_info,
+ TF_Status* status) {
+ tsl::Status cc_status;
+ if (var_info == nullptr) {
+ cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL.");
+ Set_TF_Status_from_Status(status, cc_status);
+ return nullptr;
+ }
+ if (var_info->var_info.var() == nullptr) {
+ cc_status = tsl::errors::InvalidArgument(
+ "VariableInfo does not track a resource variable.");
+ Set_TF_Status_from_Status(status, cc_status);
+ return nullptr;
+ }
+
+ tensorflow::Tensor* tensor = var_info->var_info.var()->tensor();
+ TF_Tensor* result_tensor =
+ tensorflow::TF_TensorFromTensor(*tensor, &cc_status);
+ Set_TF_Status_from_Status(status, cc_status);
+ return result_tensor;
+}
+
+void TF_DeleteVariableInfo(TF_VariableInfo* var_info) {
+ if (var_info != nullptr) {
+ delete var_info;
+ }
+}
+
+// --------------------- Coordination service --------------------------------
+TF_CoordinationServiceAgent* TF_GetCoordinationServiceAgent(
+ TF_OpKernelContext* ctx) {
+ auto* cc_ctx = reinterpret_cast(ctx);
+ return reinterpret_cast(
+ cc_ctx->coordination_service_agent());
+}
+
+bool TF_CoordinationServiceIsInitialized(TF_CoordinationServiceAgent* agent) {
+ if (agent == nullptr) return false;
+ auto* cc_agent = reinterpret_cast(agent);
+ return cc_agent->IsInitialized();
+}
+
+void TF_CoordinationServiceInsertKeyValue(const char* key, const char* value,
+ TF_CoordinationServiceAgent* agent,
+ TF_Status* status) {
+ auto* cc_agent = reinterpret_cast(agent);
+ tsl::Status cc_status = cc_agent->InsertKeyValue(key, value);
+ tsl::Set_TF_Status_from_Status(status, cc_status);
+}
+
+TF_Buffer* TF_CoordinationServiceGetKeyValue(const char* key,
+ TF_CoordinationServiceAgent* agent,
+ TF_Status* status) {
+ auto* cc_agent = reinterpret_cast(agent);
+ auto value = cc_agent->GetKeyValue(key);
+ tsl::Set_TF_Status_from_Status(status, value.status());
+ if (!value.ok()) {
+ return nullptr;
+ }
+ // Caller is responsible to call `TF_DeleteBuffer` to release the buffer.
+ TF_Buffer* result = TF_NewBuffer();
+ const std::string& value_str = *value;
+ void* data = malloc(value_str.length());
+ value_str.copy(static_cast(data), value_str.length(), 0);
+ result->data = data;
+ result->length = value_str.length();
+ result->data_deallocator = [](void* data, size_t length) { free(data); };
+ return result;
+}
+
+void TF_CoordinationServiceDeleteKeyValue(const char* key,
+ TF_CoordinationServiceAgent* agent,
+ TF_Status* status) {
+ auto* cc_agent = reinterpret_cast(agent);
+ tsl::Status cc_status = cc_agent->DeleteKeyValue(key);
+ tsl::Set_TF_Status_from_Status(status, cc_status);
+}
+
+// ---------------------------- PJRT -----------------------------------------
+void TF_CreateAndSetPjRtCApiClient(const char* device_type, TF_Status* status) {
+ tsl::StatusOr> pjrt_client =
+ xla::GetCApiClient(device_type);
+ if (!pjrt_client.ok()) {
+ tensorflow::Set_TF_Status_from_Status(status, pjrt_client.status());
+ return;
+ }
+
+ tsl::Status s = tensorflow::SetPjRtClientInTFGlobalResourceManager(
+ tensorflow::DeviceType(device_type), std::move(*pjrt_client));
+ tsl::Set_TF_Status_from_Status(status, s);
+}
+
+PJRT_Client* TF_GetPjRtCClient(const char* device_type, TF_Status* status) {
+ tsl::StatusOr pjrt_client =
+ tensorflow::GetOrCreatePjRtClient(tensorflow::DeviceType(device_type));
+ if (!pjrt_client.ok()) {
+ tensorflow::Set_TF_Status_from_Status(status, pjrt_client.status());
+ return nullptr;
+ }
+ auto* pjrt_c_api_client =
+ tensorflow::down_cast(*pjrt_client);
+ if (pjrt_c_api_client == nullptr) {
+ tensorflow::Set_TF_Status_from_Status(
+ status, tsl::errors::Internal("PjRtClient for ", device_type,
+ " is not type PjRtCApiClient"));
+ return nullptr;
+ }
+ TF_SetStatus(status, TF_OK, "");
+ return pjrt_c_api_client->pjrt_c_client();
+}
+
+PJRT_Buffer* TF_GetPjRtCBuffer(TF_Tensor* c_tensor, TF_Status* status) {
+ tensorflow::Tensor tensor;
+ auto s = tensorflow::TF_TensorToTensor(c_tensor, &tensor);
+ if (!s.ok()) {
+ tensorflow::Set_TF_Status_from_Status(status, s);
+ return nullptr;
+ }
+ tensorflow::AsyncValueTensor* av_tensor =
+ tensorflow::AsyncValueTensor::FromTensor(&tensor);
+ if (av_tensor == nullptr || av_tensor->GetBuffer() == nullptr) {
+ tensorflow::Set_TF_Status_from_Status(
+ status,
+ tsl::errors::Internal("Input tensor does not have PjRtBuffer."));
+ return nullptr;
+ }
+ auto* c_api_buffer =
+ tensorflow::down_cast(av_tensor->GetBuffer().get());
+ if (c_api_buffer == nullptr) {
+ tensorflow::Set_TF_Status_from_Status(
+ status,
+ tsl::errors::Internal(
+ "The PjRtBuffer in the tensor is not type PjRtCApiBuffer."));
+ return nullptr;
+ }
+ TF_SetStatus(status, TF_OK, "");
+ return c_api_buffer->c_buffer();
+}
+
+void TF_CreatePjRtBuffer(TF_Tensor* c_tensor, PJRT_Buffer* c_buffer,
+ const char* device_type, TF_Status* status) {
+ tensorflow::Tensor tensor;
+ auto s = tensorflow::TF_TensorToTensor(c_tensor, &tensor);
+ if (!s.ok()) {
+ tensorflow::Set_TF_Status_from_Status(status, s);
+ return;
+ }
+ auto pjrt_client =
+ tensorflow::GetOrCreatePjRtClient(tensorflow::DeviceType(device_type));
+ if (!pjrt_client.ok()) {
+ tensorflow::Set_TF_Status_from_Status(status, pjrt_client.status());
+ return;
+ }
+ auto* pjrt_c_api_client =
+ tensorflow::down_cast(*pjrt_client);
+ if (pjrt_c_api_client == nullptr) {
+ tensorflow::Set_TF_Status_from_Status(
+ status, tsl::errors::Internal("PjRtClient for ", device_type,
+ " is not type PjRtCApiClient"));
+ return;
+ }
+ tensorflow::AsyncValueTensor* av_tensor =
+ tensorflow::AsyncValueTensor::FromTensor(&tensor);
+ av_tensor->SetBuffer(
+ std::make_unique(pjrt_c_api_client, c_buffer));
+ TF_SetStatus(status, TF_OK, "");
+}
diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.h b/tensorflow/c/experimental/next_pluggable_device/c_api.h
new file mode 100644
index 00000000000..e577f02a595
--- /dev/null
+++ b/tensorflow/c/experimental/next_pluggable_device/c_api.h
@@ -0,0 +1,153 @@
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_
+#define TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/kernels.h"
+#include "tensorflow/c/kernels_experimental.h"
+#include "tensorflow/c/tf_buffer.h"
+#include "tensorflow/compiler/xla/pjrt/c/pjrt_c_api.h"
+
+// --------------------------------------------------------------------------
+// C API for device. The API is under active development and eventually
+// should allow registering a plugin device with TensorFlow.
+
+// Macro to control visibility of exported symbols in the shared library (.so,
+// .dylib, .dll).
+// This duplicates the TF_EXPORT macro definition in
+// tensorflow/core/platform/macros.h in order to keep this .h file independent
+// of any other includes.
+#ifdef SWIG
+#define TF_CAPI_EXPORT
+#else
+#if defined(_WIN32)
+#ifdef TF_COMPILE_LIBRARY
+#define TF_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TF_CAPI_EXPORT __declspec(dllimport)
+#endif // TF_COMPILE_LIBRARY
+#else
+#define TF_CAPI_EXPORT __attribute__((visibility("default")))
+#endif // _WIN32
+#endif // SWIG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TF_Device is a C wrapper to the C++ TF Device class. This is to be passed
+// through TF_OpKernelContext, and is opaque to plugin.
+typedef struct TF_Device TF_Device;
+
+typedef struct TF_VariableInfo TF_VariableInfo;
+
+// Returns a `TF_Device` pointer, which actually points to a C++ `Device`.
+// Currently we only allow `NextPluggableDevice` to be casted as `TF_Device`,
+// but in theory this could be a C API for every kind of device.
+TF_CAPI_EXPORT extern TF_Device* TF_GetDevice(TF_OpKernelContext* ctx);
+
+TF_CAPI_EXPORT extern size_t TF_GetDeviceOrdinal(TF_Device* device);
+
+// -------------------------- Resource ---------------------------------------
+// Create a `tensorflow::PluginResource` to the ResourceMgr provided by the
+// `ctx`. The `tensorflow::PluginResource` wraps a resource by plugin (as a
+// opaque pointer, since TensorFlow cannot parse it). `delete_func` is needed
+// for ResourceMgr to clean up the resource. `status` will be set.
+TF_CAPI_EXPORT extern void TF_CreatePluginResource(
+ TF_OpKernelContext* ctx, const char* container_name,
+ const char* plugin_resource_name, void* plugin_resource,
+ void (*delete_func)(void*), TF_Status* status);
+
+// If the ResourceMgr provided by the `ctx` has a resource
+// `plugin_resource_name`, returns it in `*result_plugin_resource`. Otherwise,
+// invokes create_func to create the resource. `delete_func` is needed for
+// ResourceMgr to clean up the resource. `status` will be set. If `status` is
+// not OK, `*result_plugin_resource` will be set as nullptr.
+//
+// Caller does not take ownership of the `plugin_resource`.
+TF_CAPI_EXPORT extern void TF_LookupOrCreatePluginResource(
+ TF_OpKernelContext* ctx, const char* container_name,
+ const char* plugin_resource_name, void** result_plugin_resource,
+ void* (*create_func)(void*), void* create_func_args,
+ void (*delete_func)(void*), TF_Status* status);
+
+// ------------------------- VariableInfo ------------------------------------
+TF_CAPI_EXPORT extern TF_VariableInfo* TF_CreateVariableInfoFromContext(
+ TF_OpKernelContext* ctx, int index, TF_Status* status);
+
+TF_CAPI_EXPORT extern void TF_LockVariableInfos(TF_VariableInfo** vars,
+ int num_vars,
+ TF_Status* status);
+
+TF_CAPI_EXPORT extern void TF_AllocateTempForVariableInfo(
+ TF_OpKernelContext* ctx, TF_VariableInfo* var_info, TF_Status* status);
+
+TF_CAPI_EXPORT extern TF_Tensor* TF_GetTensorFromVariableInfo(
+ TF_VariableInfo* var_info, TF_Status* status);
+
+TF_CAPI_EXPORT extern void TF_DeleteVariableInfo(TF_VariableInfo* var_info);
+
+// --------------------- Coordination service --------------------------------
+// Returns a non-owning pointer to the coordination service agent, which is
+// opaque to plugin. Plugin OpKernels need to use the accompanying C APIs to
+// access coordination service functionalities.
+TF_CAPI_EXPORT extern TF_CoordinationServiceAgent*
+TF_GetCoordinationServiceAgent(TF_OpKernelContext* ctx);
+
+// Returns true if the coordination service agent has been initialized.
+TF_CAPI_EXPORT extern bool TF_CoordinationServiceIsInitialized(
+ TF_CoordinationServiceAgent* agent);
+
+TF_CAPI_EXPORT extern void TF_CoordinationServiceInsertKeyValue(
+ const char* key, const char* value, TF_CoordinationServiceAgent* agent,
+ TF_Status* status);
+
+// Obtains key-value from coordination service agent. The returned `TF_Buffer`
+// is a newly allocated buffer to hold the string key-value, and caller is
+// responsible for managing the lifetime. If error, `status` will be set and a
+// nullptr will be returned.
+TF_CAPI_EXPORT extern TF_Buffer* TF_CoordinationServiceGetKeyValue(
+ const char* key, TF_CoordinationServiceAgent* agent, TF_Status* status);
+
+TF_CAPI_EXPORT extern void TF_CoordinationServiceDeleteKeyValue(
+ const char* key, TF_CoordinationServiceAgent* agent, TF_Status* status);
+
+// ---------------------------- PJRT -----------------------------------------
+TF_CAPI_EXPORT extern void TF_CreateAndSetPjRtCApiClient(
+ const char* device_type, TF_Status* status);
+
+// Gets the `PJRT_Client*` stored in TF global ResourceManager.
+TF_CAPI_EXPORT extern PJRT_Client* TF_GetPjRtCClient(const char* device_type,
+ TF_Status* status);
+
+// Gets the `PJRT_Buffer*` stored in the tensor. The status will contain error
+// if the tensor does not have a `PjRtCApiBuffer`.
+TF_CAPI_EXPORT extern PJRT_Buffer* TF_GetPjRtCBuffer(TF_Tensor* c_tensor,
+ TF_Status* status);
+
+// Creates a `PjRtCApiBuffer` with the `PJRT_Buffer*` passed in and set to the
+// tensor.
+TF_CAPI_EXPORT extern void TF_CreatePjRtBuffer(TF_Tensor* c_tensor,
+ PJRT_Buffer* c_buffer,
+ const char* device_type,
+ TF_Status* status);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // TENSORFLOW_C_EXPERIMENTAL_NEXT_PLUGGABLE_DEVICE_C_API_H_
diff --git a/tensorflow/c/experimental/ops/BUILD b/tensorflow/c/experimental/ops/BUILD
index e5cf1c39f65..13f1c808d45 100644
--- a/tensorflow/c/experimental/ops/BUILD
+++ b/tensorflow/c/experimental/ops/BUILD
@@ -3,6 +3,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
# Experimental ops. These will eventually be replaced by machine-generated versions.
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/BUILD b/tensorflow/c/experimental/ops/gen/BUILD
index 21e855dceb9..7ab0a9f49c5 100644
--- a/tensorflow/c/experimental/ops/gen/BUILD
+++ b/tensorflow/c/experimental/ops/gen/BUILD
@@ -4,6 +4,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:public"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/common/BUILD b/tensorflow/c/experimental/ops/gen/common/BUILD
index 2dcbc644cf0..a5618623bbd 100644
--- a/tensorflow/c/experimental/ops/gen/common/BUILD
+++ b/tensorflow/c/experimental/ops/gen/common/BUILD
@@ -4,6 +4,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:public"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/cpp/BUILD b/tensorflow/c/experimental/ops/gen/cpp/BUILD
index 7b9aa347198..d2fd0294adb 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/BUILD
@@ -4,6 +4,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:private"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/cpp/golden/BUILD b/tensorflow/c/experimental/ops/gen/cpp/golden/BUILD
index 5180b86cece..86880db388b 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/golden/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/golden/BUILD
@@ -1,4 +1,5 @@
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:public"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
index 2d41ae84512..7589ea2d2f2 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
@@ -4,6 +4,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:private"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/cpp/views/BUILD b/tensorflow/c/experimental/ops/gen/cpp/views/BUILD
index 455c6cac143..46f61c89d8e 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/views/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/views/BUILD
@@ -1,4 +1,5 @@
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:private"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/ops/gen/model/BUILD b/tensorflow/c/experimental/ops/gen/model/BUILD
index 04df5d61748..918acaabb6b 100644
--- a/tensorflow/c/experimental/ops/gen/model/BUILD
+++ b/tensorflow/c/experimental/ops/gen/model/BUILD
@@ -1,4 +1,5 @@
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//tensorflow/c/experimental/ops/gen:__subpackages__"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/pluggable_profiler/BUILD b/tensorflow/c/experimental/pluggable_profiler/BUILD
index 9fd79348de6..4e3de6a46c1 100644
--- a/tensorflow/c/experimental/pluggable_profiler/BUILD
+++ b/tensorflow/c/experimental/pluggable_profiler/BUILD
@@ -5,6 +5,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.default.bzl", "filegroup")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -61,8 +62,8 @@ cc_library(
"//tensorflow/c:tf_status",
"//tensorflow/c:tf_status_helper",
"//tensorflow/core/platform:status",
- "//tensorflow/core/profiler:profiler_options_proto_cc",
"//tensorflow/core/profiler/lib:profiler_interface",
"//tensorflow/core/profiler/protobuf:xplane_proto_cc",
+ "//tensorflow/tsl/profiler/protobuf:profiler_options_proto_cc",
],
)
diff --git a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.cc b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.cc
index 6e8cc32e556..0efa257723b 100644
--- a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.cc
+++ b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/profiler/lib/profiler_factory.h"
#include "tensorflow/core/profiler/lib/profiler_interface.h"
-#include "tensorflow/core/profiler/profiler_options.pb.h"
+#include "tensorflow/tsl/profiler/protobuf/profiler_options.pb.h"
namespace tensorflow {
namespace profiler {
diff --git a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h
index 103c0905f08..6dbbe4549ff 100644
--- a/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h
+++ b/tensorflow/c/experimental/pluggable_profiler/pluggable_profiler_internal.h
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/c/tf_status_helper.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/profiler/lib/profiler_interface.h"
-#include "tensorflow/core/profiler/profiler_options.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
+#include "tensorflow/tsl/profiler/protobuf/profiler_options.pb.h"
namespace tensorflow {
namespace profiler {
diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD
index 394c7de8b59..d72cf86a7bc 100644
--- a/tensorflow/c/experimental/saved_model/core/BUILD
+++ b/tensorflow/c/experimental/saved_model/core/BUILD
@@ -11,7 +11,9 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
+ # copybara:uncomment() "//learning/brain/tfrt/aot:__pkg__",
"//tensorflow/c:__subpackages__",
"//tensorflow/c/experimental/saved_model/internal:__pkg__",
"//tensorflow/cc/experimental/libtf:__pkg__",
diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD
index 14fa051a4ab..cce725db3fc 100644
--- a/tensorflow/c/experimental/saved_model/core/ops/BUILD
+++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD
@@ -8,6 +8,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
# Restricting visibility for now
"//tensorflow/c/experimental/saved_model/core:__subpackages__",
diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD
index 3c2050e79ec..ab7de9bae06 100644
--- a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD
+++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD
@@ -3,9 +3,11 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
# This package contains classes corresponding to Revived SavedObjectGraph types
# used by SavedModel. See https://cs.opensource.google/tensorflow/tensorflow/+/c575e2ba93c442121d98d3f125d83fed1339924d:tensorflow/core/protobuf/saved_object_graph.proto;l=56-62
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
# Restricting visibility for now
"//tensorflow/c/experimental/saved_model/core:__pkg__",
+ # copybara:uncomment "//learning/brain/tfrt/aot:__pkg__",
],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc
index 2a4297e2b67..660a417be8f 100644
--- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc
+++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc
@@ -80,51 +80,6 @@ Status ConstantFromSavedConstant(
return internal::TensorProtoToConstant(ctx, tensor_proto, output);
}
-// Finds the "signatures" object in the object graph, and fills a mapping of
-// each signature's name to the corresponding function's node in the object
-// graph.
-Status GetSignaturesMap(const SavedObjectGraph& saved_objects,
- gtl::FlatMap<std::string, int>* signatures_map) {
- if (saved_objects.nodes().empty()) {
- return errors::FailedPrecondition("Saved Object Graph was empty.");
- }
- const SavedObject& root = saved_objects.nodes(0);
- const SavedObject* signatures = nullptr;
- for (const auto& child : root.children()) {
- if (child.local_name() == "signatures") {
- if (child.node_id() >= saved_objects.nodes().size()) {
- return errors::FailedPrecondition(
- "Signature object had child node id ", child.node_id(),
- " which exceeds the size of the set of nodes");
- }
- signatures = &saved_objects.nodes(child.node_id());
- }
- }
-
- // Some basic sanity checks that this object is actually our "signatures" map
- if (signatures == nullptr) {
- // This is where the "signatures" attribute is always set:
- // https://github.com/tensorflow/tensorflow/blob/a2c542a0d83227568f9214a2af9a38ae3625976f/tensorflow/python/saved_model/save.py#L1106-L1109
- return errors::FailedPrecondition(
- "SavedObjectGraph's root object must have a child 'signatures' object");
- }
- if (signatures->kind_case() != SavedObject::kUserObject) {
- return errors::FailedPrecondition(
- "Signatures must be a SavedObject of type UserObject.");
- }
- if (signatures->user_object().identifier() != "signature_map") {
- // This is where the string comes from:
- // https://github.com/tensorflow/tensorflow/blob/c59af2913aaec235d883f50428efef1086f4c0e6/tensorflow/python/saved_model/signature_serialization.py#L220
- return errors::FailedPrecondition(
- "Signatures SavedObject must have identifier 'signature_map'.");
- }
-
- for (const auto& child : signatures->children()) {
- (*signatures_map)[child.local_name()] = child.node_id();
- }
- return Status();
-}
-
// Perform some basic sanity checks on SavedConcreteFunction's input and
// output signatures with respect to the corresponding FunctionDef's input
// and output args.
@@ -183,6 +138,50 @@ Status ValidateSavedFunctionCompatibleWithFunctionDef(
return Status();
}
+} // namespace
+
+Status GetSignaturesMap(const SavedObjectGraph& saved_objects,
+ gtl::FlatMap<std::string, int>* signatures_map) {
+ if (saved_objects.nodes().empty()) {
+ return errors::FailedPrecondition("Saved Object Graph was empty.");
+ }
+ const SavedObject& root = saved_objects.nodes(0);
+ const SavedObject* signatures = nullptr;
+ for (const auto& child : root.children()) {
+ if (child.local_name() == "signatures") {
+ if (child.node_id() >= saved_objects.nodes().size()) {
+ return errors::FailedPrecondition(
+ "Signature object had child node id ", child.node_id(),
+ " which exceeds the size of the set of nodes");
+ }
+ signatures = &saved_objects.nodes(child.node_id());
+ }
+ }
+
+ // Some basic sanity checks that this object is actually our "signatures" map
+ if (signatures == nullptr) {
+ // This is where the "signatures" attribute is always set:
+ // https://github.com/tensorflow/tensorflow/blob/a2c542a0d83227568f9214a2af9a38ae3625976f/tensorflow/python/saved_model/save.py#L1106-L1109
+ return errors::FailedPrecondition(
+ "SavedObjectGraph's root object must have a child 'signatures' object");
+ }
+ if (signatures->kind_case() != SavedObject::kUserObject) {
+ return errors::FailedPrecondition(
+ "Signatures must be a SavedObject of type UserObject.");
+ }
+ if (signatures->user_object().identifier() != "signature_map") {
+ // This is where the string comes from:
+ // https://github.com/tensorflow/tensorflow/blob/c59af2913aaec235d883f50428efef1086f4c0e6/tensorflow/python/saved_model/signature_serialization.py#L220
+ return errors::FailedPrecondition(
+ "Signatures SavedObject must have identifier 'signature_map'.");
+ }
+
+ for (const auto& child : signatures->children()) {
+ (*signatures_map)[child.local_name()] = child.node_id();
+ }
+ return Status();
+}
+
Status ValidateSingleConcreteFunction(const SavedFunction& saved_function) {
// We only allow loading functions that have an annotated input signature,
// which means there is 1:1 correspondence between tf.function
@@ -198,8 +197,6 @@ Status ValidateSingleConcreteFunction(const SavedFunction& saved_function) {
return Status();
}
-} // namespace
-
Status LoadSavedAsset(ImmediateExecutionContext* ctx, const SavedAsset& asset,
const std::string& saved_model_dir,
absl::Span assets,
@@ -438,9 +435,11 @@ Status PartiallyReviveSavedModelObjects(const MetaGraphDef& metagraph,
resource_revival_state.device = node.resource().device();
objects->restored_resources[i] = std::move(resource_revival_state);
} else if (node.kind_case() == SavedObject::kFunction) {
- // Get the SavedFunction node and validate it has a single concrete func.
+ // Get the SavedFunction node and skip if it has no concrete functions.
const SavedFunction& saved_function = node.function();
- TF_RETURN_IF_ERROR(ValidateSingleConcreteFunction(saved_function));
+ if (saved_function.concrete_functions_size() < 1) {
+ continue;
+ }
// Retrieve related function information.
const std::string& function_name = saved_function.concrete_functions(0);
diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h
index db45e28087f..34b4499621c 100644
--- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h
+++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h
@@ -94,6 +94,15 @@ gtl::FlatMap NodeToAttrMap(
gtl::FlatMap<StringPiece, const FunctionDef*>
FunctionNameToFunctionDefMap(const FunctionDefLibrary& library);
+// Finds the "signatures" object in the object graph, and fills a mapping of
+// each signature's name to the corresponding function's node in the object
+// graph.
+Status GetSignaturesMap(const SavedObjectGraph& saved_objects,
+ gtl::FlatMap<std::string, int>* signatures_map);
+
+// Validates the `saved_function`.
+Status ValidateSingleConcreteFunction(const SavedFunction& saved_function);
+
// Walks through the SavedObjectGraph in metagraph, and restores all nodes
// (except "UserDefinedObjects") with their corresponding type in
// "PartiallyRevivedObjects".
diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD
index 2647a822f93..d6dc1f202b0 100644
--- a/tensorflow/c/experimental/saved_model/internal/BUILD
+++ b/tensorflow/c/experimental/saved_model/internal/BUILD
@@ -20,7 +20,10 @@ load(
"tf_copts",
)
-package(licenses = ["notice"])
+package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+ licenses = ["notice"],
+)
cc_library(
name = "concrete_function",
diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
index 49acc9274fc..ab1a6e3689e 100644
--- a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
+++ b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
@@ -2,6 +2,7 @@ load("//tensorflow:tensorflow.default.bzl", "filegroup")
load("//tensorflow:strict.default.bzl", "py_strict_binary")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD
index 71fd46ab889..6a711ae1738 100644
--- a/tensorflow/c/experimental/saved_model/public/BUILD
+++ b/tensorflow/c/experimental/saved_model/public/BUILD
@@ -11,6 +11,7 @@
load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
# This is intentionally public
default_visibility = [
"//visibility:public",
diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD
index 849c0f2c22b..d06c536f671 100644
--- a/tensorflow/c/experimental/stream_executor/BUILD
+++ b/tensorflow/c/experimental/stream_executor/BUILD
@@ -9,6 +9,7 @@ load(
load("//tensorflow:tensorflow.default.bzl", "filegroup")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
@@ -43,13 +44,12 @@ cc_library(
"//tensorflow/compiler/xla/stream_executor:executor_cache",
"//tensorflow/compiler/xla/stream_executor:multi_platform_manager",
"//tensorflow/compiler/xla/stream_executor:platform",
- "//tensorflow/compiler/xla/stream_executor:stream_executor_internal",
"//tensorflow/compiler/xla/stream_executor:stream_executor_pimpl",
"//tensorflow/compiler/xla/stream_executor:timer",
"//tensorflow/core:lib",
"//tensorflow/core/common_runtime/device:device_utils",
- "//tensorflow/core/platform:regexp",
"//tensorflow/core/platform:strcat",
+ "@com_google_absl//absl/functional:any_invocable",
],
)
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc
index c8a9670156b..2ba7d3cc953 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor.cc
+++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc
@@ -22,7 +22,9 @@ limitations under the License.
#include "tensorflow/c/experimental/stream_executor/stream_executor.h"
#include <string>
+#include <utility>
+#include "absl/functional/any_invocable.h"
#include "tensorflow/c/c_api_macros.h"
#include "tensorflow/c/c_api_macros_internal.h"
#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
@@ -51,7 +53,7 @@ using tensorflow::StringPiece;
using OwnedTFStatus = tensorflow::TF_StatusPtr;
namespace {
-port::Status ValidateSPPlatform(const SP_Platform& platform) {
+tsl::Status ValidateSPPlatform(const SP_Platform& platform) {
TF_VALIDATE_STRUCT_SIZE(SP_Platform, platform, SP_PLATFORM_STRUCT_SIZE);
TF_VALIDATE_NOT_NULL(SP_Platform, platform, name);
TF_VALIDATE_NOT_NULL(SP_Platform, platform, type);
@@ -63,7 +65,7 @@ port::Status ValidateSPPlatform(const SP_Platform& platform) {
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) {
+tsl::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) {
TF_VALIDATE_STRUCT_SIZE(SP_PlatformFns, platform_fns,
SP_PLATFORM_FNS_STRUCT_SIZE);
TF_VALIDATE_NOT_NULL(SP_PlatformFns, platform_fns, create_device);
@@ -77,40 +79,40 @@ port::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) {
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPTimerFns(const SP_TimerFns& timer_fns) {
+tsl::Status ValidateSPTimerFns(const SP_TimerFns& timer_fns) {
TF_VALIDATE_STRUCT_SIZE(SP_TimerFns, timer_fns, SP_TIMER_FNS_STRUCT_SIZE);
TF_VALIDATE_NOT_NULL(SP_TimerFns, timer_fns, nanoseconds);
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPAllocatorStats(const SP_AllocatorStats& stats) {
+tsl::Status ValidateSPAllocatorStats(const SP_AllocatorStats& stats) {
TF_VALIDATE_STRUCT_SIZE(SP_AllocatorStats, stats,
SP_ALLOCATORSTATS_STRUCT_SIZE);
// All other fields could theoretically be zero/null.
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPDeviceMemoryBase(const SP_DeviceMemoryBase& mem) {
+tsl::Status ValidateSPDeviceMemoryBase(const SP_DeviceMemoryBase& mem) {
TF_VALIDATE_STRUCT_SIZE(SP_DeviceMemoryBase, mem,
SP_DEVICE_MEMORY_BASE_STRUCT_SIZE);
// All other fields could theoretically be zero/null.
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPDevice(const SP_Device& device) {
+tsl::Status ValidateSPDevice(const SP_Device& device) {
TF_VALIDATE_STRUCT_SIZE(SP_Device, device, SP_DEVICE_STRUCT_SIZE);
// All other fields could theoretically be zero/null.
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPDeviceFns(const SP_DeviceFns& device_fns) {
+tsl::Status ValidateSPDeviceFns(const SP_DeviceFns& device_fns) {
TF_VALIDATE_STRUCT_SIZE(SP_DeviceFns, device_fns, SP_DEVICE_FNS_STRUCT_SIZE);
// All other fields could theoretically be zero/null.
return ::tensorflow::OkStatus();
}
-port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se,
- const SP_Platform& platform) {
+tsl::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se,
+ const SP_Platform& platform) {
TF_VALIDATE_STRUCT_SIZE(SP_StreamExecutor, se,
SP_STREAM_EXECUTOR_STRUCT_SIZE);
TF_VALIDATE_NOT_NULL(SP_StreamExecutor, se, allocate);
@@ -149,7 +151,7 @@ port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se,
return ::tensorflow::OkStatus();
}
-port::Status ValidateSEPlatformRegistrationParams(
+tsl::Status ValidateSEPlatformRegistrationParams(
const SE_PlatformRegistrationParams& params) {
TF_VALIDATE_STRUCT_SIZE(SE_PlatformRegistrationParams, params,
SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE);
@@ -193,7 +195,7 @@ DeviceMemoryBase DeviceMemoryBaseFromC(const SP_DeviceMemoryBase& mem) {
// Wrapper that allows passing std::function across C API.
struct HostCallbackContext {
- std::function<port::Status()> callback;
+ absl::AnyInvocable<tsl::Status() &&> callback;
};
// This wrapper allows calling `HostCallbackContext::callback` across C API.
@@ -201,7 +203,7 @@ struct HostCallbackContext {
// `callback_fn` to `host_callback` in `SP_StreamExecutor`.
void HostCallbackTrampoline(void* ctx, TF_Status* status) {
HostCallbackContext* host_ctx = static_cast<HostCallbackContext*>(ctx);
- port::Status s = host_ctx->callback();
+ tsl::Status s = std::move(host_ctx->callback)();
Set_TF_Status_from_Status(status, s);
delete host_ctx;
}
@@ -226,14 +228,14 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
platform_fns_->destroy_device(platform_, &device_);
}
- port::Status Init(int device_ordinal, DeviceOptions device_options) override {
+ tsl::Status Init(int device_ordinal, DeviceOptions device_options) override {
return ::tensorflow::OkStatus();
}
DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override {
SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE};
stream_executor_->allocate(&device_, size, memory_space, &mem);
- port::Status status = ValidateSPDeviceMemoryBase(mem);
+ tsl::Status status = ValidateSPDeviceMemoryBase(mem);
if (!status.ok()) {
LOG(ERROR) << status.error_message();
}
@@ -280,7 +282,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
if (!has_stats) {
return absl::nullopt;
}
- port::Status status = ValidateSPAllocatorStats(c_stats);
+ tsl::Status status = ValidateSPAllocatorStats(c_stats);
if (!status.ok()) {
LOG(ERROR) << status.error_message();
return absl::nullopt;
@@ -310,38 +312,37 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
}
return true;
}
- port::Status SynchronousMemZero(DeviceMemoryBase* location,
- uint64 size) override {
+ tsl::Status SynchronousMemZero(DeviceMemoryBase* location,
+ uint64 size) override {
// TODO(annarev): figure out if we should support memzero/memset
// functionality by allocating on host and then copying to device.
- return port::UnimplementedError(
+ return tsl::errors::Unimplemented(
"SynchronousMemZero is not supported by pluggable device.");
}
- port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
- uint64 size) override {
- return port::UnimplementedError(
+ tsl::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
+ uint64 size) override {
+ return tsl::errors::Unimplemented(
"SynchronousMemSet is not supported by pluggable device.");
}
- port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
- const void* host_src, uint64 size) override {
+ tsl::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst, const void* host_src,
+ uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(gpu_dst);
stream_executor_->sync_memcpy_htod(&device_, &device_memory_base, host_src,
size, c_status.get());
return StatusFromTF_Status(c_status.get());
}
- port::Status SynchronousMemcpy(void* host_dst,
- const DeviceMemoryBase& gpu_src,
- uint64 size) override {
+ tsl::Status SynchronousMemcpy(void* host_dst, const DeviceMemoryBase& gpu_src,
+ uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_DeviceMemoryBase device_memory_base = DeviceMemoryBaseToC(&gpu_src);
stream_executor_->sync_memcpy_dtoh(&device_, host_dst, &device_memory_base,
size, c_status.get());
return StatusFromTF_Status(c_status.get());
}
- port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
- const DeviceMemoryBase& gpu_src,
- uint64 size) override {
+ tsl::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
+ const DeviceMemoryBase& gpu_src,
+ uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_DeviceMemoryBase device_mem_dst = DeviceMemoryBaseToC(gpu_dst);
SP_DeviceMemoryBase device_mem_src = DeviceMemoryBaseToC(&gpu_src);
@@ -349,8 +350,8 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
&device_mem_src, size, c_status.get());
return StatusFromTF_Status(c_status.get());
}
- port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
- uint64 size) override {
+ tsl::Status MemZero(Stream* stream, DeviceMemoryBase* location,
+ uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
@@ -359,8 +360,8 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
c_status.get());
return StatusFromTF_Status(c_status.get());
}
- port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
- uint64 size) override {
+ tsl::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
+ uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
@@ -369,8 +370,8 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
size, c_status.get());
return StatusFromTF_Status(c_status.get());
}
- port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
- uint32 pattern, uint64 size) override {
+ tsl::Status Memset32(Stream* stream, DeviceMemoryBase* location,
+ uint32 pattern, uint64 size) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
@@ -424,27 +425,27 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
return true;
}
bool HostCallback(Stream* stream,
- std::function<port::Status()> callback) override {
+ absl::AnyInvocable<tsl::Status() &&> callback) override {
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
- HostCallbackContext* ctx = new HostCallbackContext{callback};
+ HostCallbackContext* ctx = new HostCallbackContext{std::move(callback)};
return stream_executor_->host_callback(&device_, stream_handle,
&HostCallbackTrampoline, ctx);
}
- port::Status AllocateEvent(Event* event) override {
+ tsl::Status AllocateEvent(Event* event) override {
DCHECK(event != nullptr);
return static_cast(event->implementation())->Create();
}
- port::Status DeallocateEvent(Event* event) override {
+ tsl::Status DeallocateEvent(Event* event) override {
static_cast(event->implementation())->Destroy();
return ::tensorflow::OkStatus();
}
- port::Status RecordEvent(Stream* stream, Event* event) override {
+ tsl::Status RecordEvent(Stream* stream, Event* event) override {
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
return static_cast(event->implementation())->Record(stream_handle);
}
- port::Status WaitForEvent(Stream* stream, Event* event) override {
+ tsl::Status WaitForEvent(Stream* stream, Event* event) override {
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
SP_Event event_handle =
@@ -452,7 +453,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
OwnedTFStatus c_status(TF_NewStatus());
stream_executor_->wait_for_event(&device_, stream_handle, event_handle,
c_status.get());
- port::Status s = StatusFromTF_Status(c_status.get());
+ tsl::Status s = StatusFromTF_Status(c_status.get());
return s;
}
Event::Status PollForEventStatus(Event* event) override {
@@ -464,7 +465,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
}
bool AllocateStream(Stream* stream) override {
DCHECK(stream != nullptr);
- port::Status status =
+ tsl::Status status =
static_cast(stream->implementation())->Create();
// TODO(annarev): update AllocateStream to return status instead
// (similar to AllocateEvent).
@@ -488,7 +489,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
return true;
}
bool AllocateTimer(Timer* timer) override {
- port::Status status =
+ tsl::Status status =
static_cast(timer->implementation())->Create();
// TODO(annarev): change return value of AllocateTimer
// to status (similar to AllocateEvent).
@@ -525,7 +526,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
}
return true;
}
- port::Status BlockHostForEvent(Stream* stream, Event* event) {
+ tsl::Status BlockHostForEvent(Stream* stream, Event* event) {
OwnedTFStatus c_status(TF_NewStatus());
SP_Event event_handle =
static_cast(event->implementation())->Handle();
@@ -534,7 +535,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
return StatusFromTF_Status(c_status.get());
}
- port::Status BlockHostUntilDone(Stream* stream) override {
+ tsl::Status BlockHostUntilDone(Stream* stream) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
@@ -551,7 +552,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));
stream_executor_->record_event(&device_, stream_handle, event_handle,
c_status.get());
- port::Status s = StatusFromTF_Status(c_status.get());
+ tsl::Status s = StatusFromTF_Status(c_status.get());
if (!s.ok()) {
stream_executor_->destroy_event(&device_, event_handle);
return s;
@@ -562,7 +563,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
return StatusFromTF_Status(c_status.get());
}
- port::Status GetStatus(Stream* stream) override {
+ tsl::Status GetStatus(Stream* stream) override {
OwnedTFStatus c_status(TF_NewStatus());
SP_Stream stream_handle =
static_cast(stream->implementation())->Handle();
@@ -571,8 +572,8 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
return StatusFromTF_Status(c_status.get());
}
int PlatformDeviceCount() override { return visible_device_count_; }
- port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override {
- return port::UnimplementedError(
+ tsl::Status EnablePeerAccessTo(StreamExecutorInterface* other) override {
+ return tsl::errors::Unimplemented(
"EnablePeerAccessTo is not supported by pluggable device.");
}
bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override {
@@ -587,7 +588,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
// Creates a new DeviceDescription object.
// Ownership is transferred to the caller.
- port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
+ tsl::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
const override {
OwnedTFStatus c_status(TF_NewStatus());
@@ -679,7 +680,7 @@ CPlatform::~CPlatform() {
destroy_platform_fns_(&platform_fns_);
}
-port::StatusOr<std::unique_ptr<DeviceDescription>>
+tsl::StatusOr<std::unique_ptr<DeviceDescription>>
CPlatform::DescriptionForDevice(int ordinal) const {
// TODO(annarev): see if we can get StreamExecutor instance
// and call GetDeviceDescription. executor_cache_.Get would need
@@ -688,24 +689,24 @@ CPlatform::DescriptionForDevice(int ordinal) const {
builder.set_name(name_);
return builder.Build();
}
-port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDevice(int ordinal) {
+tsl::StatusOr<StreamExecutor*> CPlatform::ExecutorForDevice(int ordinal) {
stream_executor::StreamExecutorConfig config;
config.ordinal = ordinal;
return GetExecutor(config);
}
-port::StatusOr<StreamExecutor*> CPlatform::ExecutorForDeviceWithPluginConfig(
+tsl::StatusOr<StreamExecutor*> CPlatform::ExecutorForDeviceWithPluginConfig(
int ordinal, const PluginConfig& plugin_config) {
StreamExecutorConfig config;
config.ordinal = ordinal;
config.plugin_config = plugin_config;
return GetExecutor(config);
}
-port::StatusOr<StreamExecutor*> CPlatform::GetExecutor(
+tsl::StatusOr<StreamExecutor*> CPlatform::GetExecutor(
const StreamExecutorConfig& config) {
return executor_cache_.GetOrCreate(
config, [&]() { return GetUncachedExecutor(config); });
}
-port::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
+tsl::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
const StreamExecutorConfig& config) {
// Fill device creation params
SE_CreateDeviceParams device_params{SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE};
@@ -734,9 +735,8 @@ port::StatusOr> CPlatform::GetUncachedExecutor(
return result;
}
-port::Status InitStreamExecutorPlugin(void* dso_handle,
- std::string* device_type,
- std::string* platform_name) {
+tsl::Status InitStreamExecutorPlugin(void* dso_handle, std::string* device_type,
+ std::string* platform_name) {
tensorflow::Env* env = tensorflow::Env::Default();
// Step 1: Load symbol for `TF_InitPlugin`
@@ -749,9 +749,9 @@ port::Status InitStreamExecutorPlugin(void* dso_handle,
return InitStreamExecutorPlugin(init_fn, device_type, platform_name);
}
-port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
- std::string* device_type,
- std::string* platform_name) {
+tsl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
+ std::string* device_type,
+ std::string* platform_name) {
SE_PlatformRegistrationParams params{
SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE};
SP_Platform platform{SP_PLATFORM_STRUCT_SIZE};
@@ -804,7 +804,7 @@ port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
std::move(platform), params.destroy_platform, std::move(platform_fns),
params.destroy_platform_fns, std::move(device_fns), std::move(se),
std::move(timer_fns)));
- SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform(
+ TF_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform(
std::move(cplatform)));
// TODO(annarev): Return `use_bfc_allocator` value in some way so that it is
// available in `PluggableDeviceProcessState` once the latter is checked in.
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
index 7246dde2660..ad8a77d61fa 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
+++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h
@@ -21,7 +21,6 @@ limitations under the License.
#include "tensorflow/c/experimental/stream_executor/stream_executor.h"
#include "tensorflow/c/tf_status_helper.h"
#include "tensorflow/compiler/xla/stream_executor/executor_cache.h"
-#include "tensorflow/compiler/xla/stream_executor/lib/status.h"
#include "tensorflow/compiler/xla/stream_executor/platform.h"
namespace stream_executor {
@@ -33,15 +32,14 @@ typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams* const,
// Registers StreamExecutor platform. `device_type` and `platform_name` are
// output parameters.
-port::Status InitStreamExecutorPlugin(void* dso_handle,
- std::string* device_type,
- std::string* platform_name);
+tsl::Status InitStreamExecutorPlugin(void* dso_handle, std::string* device_type,
+ std::string* platform_name);
// Allow registering a StreamExecutor plugin using a function (used for
// testing).
-port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
- std::string* device_type,
- std::string* platform_name);
+tsl::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn,
+ std::string* device_type,
+ std::string* platform_name);
// This file implements core stream executor base classes in terms of
// the C API defined in stream_executor.h. A class "CSomething" represents a
@@ -71,14 +69,14 @@ class CPlatform : public Platform {
}
bool UseBfcAllocator() const { return platform_.use_bfc_allocator; }
bool ForceMemoryGrowth() const { return platform_.force_memory_growth; }
- port::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
+ tsl::StatusOr<std::unique_ptr<DeviceDescription>> DescriptionForDevice(
int ordinal) const override;
- port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
- port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+ tsl::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+ tsl::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
int ordinal, const PluginConfig& plugin_config) override;
- port::StatusOr<StreamExecutor*> GetExecutor(
+ tsl::StatusOr<StreamExecutor*> GetExecutor(
const StreamExecutorConfig& config) override;
- port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+ tsl::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
const StreamExecutorConfig& config) override;
// Trace listener is not supported
@@ -110,10 +108,10 @@ class CStream : public internal::StreamInterface {
stream_handle_(nullptr) {}
~CStream() override { Destroy(); }
- port::Status Create() {
+ tsl::Status Create() {
tensorflow::TF_StatusPtr c_status(TF_NewStatus());
stream_executor_->create_stream(device_, &stream_handle_, c_status.get());
- port::Status s = tensorflow::StatusFromTF_Status(c_status.get());
+ tsl::Status s = tensorflow::StatusFromTF_Status(c_status.get());
return s;
}
@@ -140,13 +138,13 @@ class CEvent : public internal::EventInterface {
event_handle_(nullptr) {}
~CEvent() override { Destroy(); }
- port::Status Create() {
+ tsl::Status Create() {
tensorflow::TF_StatusPtr c_status(TF_NewStatus());
stream_executor_->create_event(device_, &event_handle_, c_status.get());
return tensorflow::StatusFromTF_Status(c_status.get());
}
- port::Status Record(SP_Stream stream_handle) {
+ tsl::Status Record(SP_Stream stream_handle) {
tensorflow::TF_StatusPtr c_status(TF_NewStatus());
stream_executor_->record_event(device_, stream_handle, event_handle_,
c_status.get());
@@ -178,7 +176,7 @@ class CTimer : public internal::TimerInterface {
timer_fns_(timer_fns) {}
~CTimer() override { Destroy(); }
- port::Status Create() {
+ tsl::Status Create() {
tensorflow::TF_StatusPtr c_status(TF_NewStatus());
stream_executor_->create_timer(device_, &timer_handle_, c_status.get());
return tensorflow::StatusFromTF_Status(c_status.get());
diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
index 8b82121c51d..cf21374c48f 100644
--- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
+++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc
@@ -38,17 +38,17 @@ TEST(StreamExecutor, SuccessfulRegistration) {
test_util::PopulateDefaultPlatformRegistrationParams(params);
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
TF_ASSERT_OK(status);
- port::StatusOr maybe_platform =
+ tsl::StatusOr maybe_platform =
MultiPlatformManager::PlatformWithName("MY_DEVICE");
TF_ASSERT_OK(maybe_platform.status());
Platform* platform = std::move(maybe_platform).value();
ASSERT_EQ(platform->Name(), test_util::kDeviceName);
ASSERT_EQ(platform->VisibleDeviceCount(), test_util::kDeviceCount);
- port::StatusOr maybe_executor =
+ tsl::StatusOr maybe_executor =
platform->ExecutorForDevice(0);
TF_ASSERT_OK(maybe_executor.status());
}
@@ -62,7 +62,7 @@ TEST(StreamExecutor, NameNotSet) {
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION);
ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set.");
@@ -77,7 +77,7 @@ TEST(StreamExecutor, InvalidNameWithSemicolon) {
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION);
EXPECT_THAT(
@@ -94,7 +94,7 @@ TEST(StreamExecutor, InvalidNameWithSlash) {
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION);
EXPECT_THAT(status.error_message(),
@@ -110,7 +110,7 @@ TEST(StreamExecutor, CreateDeviceNotSet) {
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION);
ASSERT_EQ(status.error_message(),
@@ -126,7 +126,7 @@ TEST(StreamExecutor, UnifiedMemoryAllocateNotSet) {
};
std::string device_type, platform_name;
- port::Status status =
+ tsl::Status status =
InitStreamExecutorPlugin(plugin_init, &device_type, &platform_name);
ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION);
ASSERT_EQ(
@@ -152,7 +152,7 @@ class StreamExecutorTest : public ::testing::Test {
platform_, test_util::DestroyPlatform, platform_fns_,
test_util::DestroyPlatformFns, device_fns_, se_, timer_fns_);
}
- port::StatusOr maybe_executor =
+ tsl::StatusOr maybe_executor =
cplatform_->ExecutorForDevice(ordinal);
TF_CHECK_OK(maybe_executor.status());
return std::move(maybe_executor).value();
@@ -724,7 +724,7 @@ TEST_F(StreamExecutorTest, HostCallbackOk) {
StreamExecutor* executor = GetExecutor(0);
Stream stream(executor);
stream.Init();
- std::function callback = []() -> port::Status {
+ std::function callback = []() -> tsl::Status {
return ::tensorflow::OkStatus();
};
stream.ThenDoHostCallbackWithStatus(callback);
@@ -744,8 +744,8 @@ TEST_F(StreamExecutorTest, HostCallbackError) {
StreamExecutor* executor = GetExecutor(0);
Stream stream(executor);
stream.Init();
- std::function callback = []() -> port::Status {
- return port::UnimplementedError("Unimplemented");
+ std::function callback = []() -> tsl::Status {
+ return tsl::errors::Unimplemented("Unimplemented");
};
stream.ThenDoHostCallbackWithStatus(callback);
ASSERT_FALSE(stream.ok());
diff --git a/tensorflow/c/experimental/stream_executor/test/BUILD b/tensorflow/c/experimental/stream_executor/test/BUILD
index e3795a2715b..2a4d40b3e79 100644
--- a/tensorflow/c/experimental/stream_executor/test/BUILD
+++ b/tensorflow/c/experimental/stream_executor/test/BUILD
@@ -6,6 +6,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index c3a54a46b3c..85b2433ac43 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/c/kernels.h"
#include
+#include
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/c/c_api_macros.h"
@@ -26,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_handle.pb.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/types.h"
// Required for IS_MOBILE_PLATFORM definition
@@ -295,6 +297,13 @@ void TF_InputRange(TF_OpKernelContext* ctx, const char* name,
tensorflow::Set_TF_Status_from_Status(args->status, status);
}
+TF_DataType TF_InputDatatype(TF_OpKernelContext* ctx, int index) {
+ auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+ CHECK_GE(index, 0); // Crash OK
+ CHECK_LT(index, cc_ctx->num_inputs()); // Crash OK
+ return static_cast(cc_ctx->input_dtype(index));
+}
+
void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
TF_Status* status) {
auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
@@ -354,6 +363,18 @@ void TF_GetSerializedConfigProto(TF_OpKernelContext* ctx,
tensorflow::Set_TF_Status_from_Status(status, cc_status);
}
+void TF_GetSerializedResourceHandleProto(
+ TF_OpKernelContext* ctx, int i, TF_Buffer* serialized_resource_handle_proto,
+ TF_Status* status) {
+ auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+ const tensorflow::ResourceHandle& handle = HandleFromInput(cc_ctx, i);
+ tensorflow::ResourceHandleProto handle_proto;
+ handle.AsProto(&handle_proto);
+ auto cc_status = tensorflow::MessageToBuffer(
+ handle_proto, serialized_resource_handle_proto);
+ tensorflow::Set_TF_Status_from_Status(status, cc_status);
+}
+
void TF_OpKernelConstruction_Failure(TF_OpKernelConstruction* ctx,
TF_Status* status) {
auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx);
@@ -652,6 +673,18 @@ int64_t TF_GetIterId(TF_OpKernelContext* ctx) {
.iter_id;
}
+int64_t TF_GetStepId(TF_OpKernelContext* ctx) {
+ return reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->step_id();
+}
+
+int TF_GetDeviceId(TF_OpKernelContext* ctx) {
+ // TensorFlow always sets device in OpKernelContext.
+ auto* device =
+ reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->device();
+ if (!device->parsed_name().has_id) return -1;
+ return device->parsed_name().id;
+}
+
TF_StringView TF_GetOpKernelName(TF_OpKernelContext* ctx) {
auto cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
TF_StringView opkernel_name_sv;
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index e85dc9f252a..2e765b7dfaa 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -190,6 +190,11 @@ TF_CAPI_EXPORT extern void TF_InputRange(TF_OpKernelContext* ctx,
const char* name,
TF_InputRange_Args* args);
+// Returns the data type of the index-th input. If index < 0 or index >=
+// TF_NumInputs(ctx), the program aborts.
+TF_CAPI_EXPORT extern TF_DataType TF_InputDatatype(TF_OpKernelContext* ctx,
+ int index);
+
// Sets the ith output of ctx to tensor. If TF_GetCode(status) is anything but
// TF_OK, ctx is left unmodified.
//
@@ -216,6 +221,11 @@ TF_CAPI_EXPORT extern void TF_GetSerializedConfigProto(
TF_OpKernelContext* ctx, TF_Buffer* serialized_config_proto,
TF_Status* status);
+// Retrieves a serialized ResourceHandleProto. Status will be set.
+TF_CAPI_EXPORT extern void TF_GetSerializedResourceHandleProto(
+ TF_OpKernelContext* ctx, int i, TF_Buffer* serialized_resource_handle_proto,
+ TF_Status* status);
+
// Notifies the given OpKernelConstruction that kernel construction has failed.
TF_CAPI_EXPORT extern void TF_OpKernelConstruction_Failure(
TF_OpKernelConstruction* ctx, TF_Status* status);
@@ -253,6 +263,12 @@ TF_CAPI_EXPORT extern uint64_t TF_GetFrameId(TF_OpKernelContext* ctx);
// Returns the Iter ID of the given context.
TF_CAPI_EXPORT extern int64_t TF_GetIterId(TF_OpKernelContext* ctx);
+// Returns the Step ID of the given context.
+TF_CAPI_EXPORT extern int64_t TF_GetStepId(TF_OpKernelContext* ctx);
+
+// Returns the Device ID of the device that the context possesses.
+TF_CAPI_EXPORT extern int TF_GetDeviceId(TF_OpKernelContext* ctx);
+
// Returns the graph def version of the given context.
TF_CAPI_EXPORT extern int TF_GetGraphDefVersion(TF_OpKernelContext* ctx);
diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD
index 99fbcfabab4..93ed9a7880b 100644
--- a/tensorflow/c/kernels/BUILD
+++ b/tensorflow/c/kernels/BUILD
@@ -3,6 +3,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test")
load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:public"],
licenses = ["notice"],
)
diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h
index 3e6121bf989..df0c1fb45b0 100644
--- a/tensorflow/c/tf_datatype.h
+++ b/tensorflow/c/tf_datatype.h
@@ -59,7 +59,7 @@ typedef enum TF_DataType {
TF_QINT8 = 11, // Quantized int8
TF_QUINT8 = 12, // Quantized uint8
TF_QINT32 = 13, // Quantized int32
- TF_BFLOAT16 = 14, // Float32 truncated to 16 bits. Only for cast ops.
+ TF_BFLOAT16 = 14, // Float32 truncated to 16 bits.
TF_QINT16 = 15, // Quantized int16
TF_QUINT16 = 16, // Quantized uint16
TF_UINT16 = 17,
@@ -69,6 +69,9 @@ typedef enum TF_DataType {
TF_VARIANT = 21,
TF_UINT32 = 22,
TF_UINT64 = 23,
+ TF_FLOAT8_E5M2 = 24, // 5 exponent bits, 2 mantissa bits.
+ TF_FLOAT8_E4M3FN = 25, // 4 exponent bits, 3 mantissa bits, finite-only, with
+ // 2 NaNs (0bS1111111).
} TF_DataType;
// TF_DataTypeSize returns the sizeof() for the underlying type corresponding
diff --git a/tensorflow/c/tf_status.cc b/tensorflow/c/tf_status.cc
index 2f774fa7977..686e09508ac 100644
--- a/tensorflow/c/tf_status.cc
+++ b/tensorflow/c/tf_status.cc
@@ -16,39 +16,21 @@ limitations under the License.
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_status_internal.h"
-#include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/status.h"
-using ::tensorflow::Status;
-using ::tensorflow::error::Code;
-using ::tensorflow::errors::IOError;
-
-TF_Status* TF_NewStatus() { return new TF_Status; }
-
-void TF_DeleteStatus(TF_Status* s) { delete s; }
+// Trampoline implementation to redirect to TSL. Kept here for backward
+// compatibility only.
+TF_Status* TF_NewStatus() { return TSL_NewStatus(); }
+void TF_DeleteStatus(TF_Status* s) { TSL_DeleteStatus(s); }
void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg) {
- if (code == TF_OK) {
- s->status = ::tensorflow::OkStatus();
- return;
- }
- s->status = Status(static_cast(code), tensorflow::StringPiece(msg));
+ TSL_SetStatus(s, TSL_Code(code), msg);
}
-
void TF_SetPayload(TF_Status* s, const char* key, const char* value) {
- s->status.SetPayload(key, value);
+ TSL_SetPayload(s, key, value);
}
-
void TF_SetStatusFromIOError(TF_Status* s, int error_code,
const char* context) {
- // TODO(b/139060984): Handle windows when changing its filesystem
- s->status = IOError(context, error_code);
-}
-
-TF_Code TF_GetCode(const TF_Status* s) {
- return static_cast(s->status.code());
-}
-
-const char* TF_Message(const TF_Status* s) {
- return s->status.error_message().c_str();
+ TSL_SetStatusFromIOError(s, error_code, context);
}
+TF_Code TF_GetCode(const TF_Status* s) { return TF_Code(TSL_GetCode(s)); }
+const char* TF_Message(const TF_Status* s) { return TSL_Message(s); }
diff --git a/tensorflow/c/tf_status.h b/tensorflow/c/tf_status.h
index 4616ee434d9..db1d32bf8e7 100644
--- a/tensorflow/c/tf_status.h
+++ b/tensorflow/c/tf_status.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_C_TF_STATUS_H_
#define TENSORFLOW_C_TF_STATUS_H_
+#include "tensorflow/tsl/c/tsl_status.h"
+
#ifdef SWIG
#define TF_CAPI_EXPORT
#else
@@ -34,30 +36,29 @@ limitations under the License.
extern "C" {
#endif
-typedef struct TF_Status TF_Status;
+typedef struct TSL_Status TF_Status;
// --------------------------------------------------------------------------
// TF_Code holds an error code. The enum values here are identical to
// corresponding values in error_codes.proto.
-typedef enum TF_Code {
- TF_OK = 0,
- TF_CANCELLED = 1,
- TF_UNKNOWN = 2,
- TF_INVALID_ARGUMENT = 3,
- TF_DEADLINE_EXCEEDED = 4,
- TF_NOT_FOUND = 5,
- TF_ALREADY_EXISTS = 6,
- TF_PERMISSION_DENIED = 7,
- TF_UNAUTHENTICATED = 16,
- TF_RESOURCE_EXHAUSTED = 8,
- TF_FAILED_PRECONDITION = 9,
- TF_ABORTED = 10,
- TF_OUT_OF_RANGE = 11,
- TF_UNIMPLEMENTED = 12,
- TF_INTERNAL = 13,
- TF_UNAVAILABLE = 14,
- TF_DATA_LOSS = 15,
-} TF_Code;
+typedef TSL_Code TF_Code;
+#define TF_OK TSL_OK
+#define TF_CANCELLED TSL_CANCELLED
+#define TF_UNKNOWN TSL_UNKNOWN
+#define TF_INVALID_ARGUMENT TSL_INVALID_ARGUMENT
+#define TF_DEADLINE_EXCEEDED TSL_DEADLINE_EXCEEDED
+#define TF_NOT_FOUND TSL_NOT_FOUND
+#define TF_ALREADY_EXISTS TSL_ALREADY_EXISTS
+#define TF_PERMISSION_DENIED TSL_PERMISSION_DENIED
+#define TF_UNAUTHENTICATED TSL_UNAUTHENTICATED
+#define TF_RESOURCE_EXHAUSTED TSL_RESOURCE_EXHAUSTED
+#define TF_FAILED_PRECONDITION TSL_FAILED_PRECONDITION
+#define TF_ABORTED TSL_ABORTED
+#define TF_OUT_OF_RANGE TSL_OUT_OF_RANGE
+#define TF_UNIMPLEMENTED TSL_UNIMPLEMENTED
+#define TF_INTERNAL TSL_INTERNAL
+#define TF_UNAVAILABLE TSL_UNAVAILABLE
+#define TF_DATA_LOSS TSL_DATA_LOSS
// --------------------------------------------------------------------------
diff --git a/tensorflow/c/tf_status_helper.cc b/tensorflow/c/tf_status_helper.cc
index 1e4360d5531..9155d9dde8b 100644
--- a/tensorflow/c/tf_status_helper.cc
+++ b/tensorflow/c/tf_status_helper.cc
@@ -17,75 +17,16 @@ limitations under the License.
#include "tensorflow/c/tf_status_internal.h"
#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/tsl/c/tsl_status_helper.h"
namespace tsl {
void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status) {
- tensorflow::error::Code code = status.code();
- const char* message(status.error_message().c_str());
-
- switch (code) {
- case tensorflow::error::OK:
- assert(TF_GetCode(tf_status) == TF_OK);
- break;
- case tensorflow::error::CANCELLED:
- TF_SetStatus(tf_status, TF_CANCELLED, message);
- break;
- case tensorflow::error::UNKNOWN:
- TF_SetStatus(tf_status, TF_UNKNOWN, message);
- break;
- case tensorflow::error::INVALID_ARGUMENT:
- TF_SetStatus(tf_status, TF_INVALID_ARGUMENT, message);
- break;
- case tensorflow::error::DEADLINE_EXCEEDED:
- TF_SetStatus(tf_status, TF_DEADLINE_EXCEEDED, message);
- break;
- case tensorflow::error::NOT_FOUND:
- TF_SetStatus(tf_status, TF_NOT_FOUND, message);
- break;
- case tensorflow::error::ALREADY_EXISTS:
- TF_SetStatus(tf_status, TF_ALREADY_EXISTS, message);
- break;
- case tensorflow::error::PERMISSION_DENIED:
- TF_SetStatus(tf_status, TF_PERMISSION_DENIED, message);
- break;
- case tensorflow::error::UNAUTHENTICATED:
- TF_SetStatus(tf_status, TF_UNAUTHENTICATED, message);
- break;
- case tensorflow::error::RESOURCE_EXHAUSTED:
- TF_SetStatus(tf_status, TF_RESOURCE_EXHAUSTED, message);
- break;
- case tensorflow::error::FAILED_PRECONDITION:
- TF_SetStatus(tf_status, TF_FAILED_PRECONDITION, message);
- break;
- case tensorflow::error::ABORTED:
- TF_SetStatus(tf_status, TF_ABORTED, message);
- break;
- case tensorflow::error::OUT_OF_RANGE:
- TF_SetStatus(tf_status, TF_OUT_OF_RANGE, message);
- break;
- case tensorflow::error::UNIMPLEMENTED:
- TF_SetStatus(tf_status, TF_UNIMPLEMENTED, message);
- break;
- case tensorflow::error::INTERNAL:
- TF_SetStatus(tf_status, TF_INTERNAL, message);
- break;
- case tensorflow::error::UNAVAILABLE:
- TF_SetStatus(tf_status, TF_UNAVAILABLE, message);
- break;
- case tensorflow::error::DATA_LOSS:
- TF_SetStatus(tf_status, TF_DATA_LOSS, message);
- break;
- default:
- assert(0);
- break;
- }
-
- errors::CopyPayloads(status, tf_status->status);
+ Set_TSL_Status_from_Status(tf_status, status);
}
Status StatusFromTF_Status(const TF_Status* tf_status) {
- return tf_status->status;
+ return StatusFromTSL_Status(tf_status);
}
} // namespace tsl
diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h
index 4c3c8af6864..df4600b85dc 100644
--- a/tensorflow/c/tf_status_helper.h
+++ b/tensorflow/c/tf_status_helper.h
@@ -21,10 +21,10 @@ limitations under the License.
namespace tsl {
// Set the attribute of "tf_status" from the attributes of "status".
-void Set_TF_Status_from_Status(TF_Status* tf_status, const tsl::Status& status);
+void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status);
// Returns a "status" from "tf_status".
-tensorflow::Status StatusFromTF_Status(const TF_Status* tf_status);
+Status StatusFromTF_Status(const TF_Status* tf_status);
} // namespace tsl
namespace tensorflow {
diff --git a/tensorflow/c/tf_status_helper_test.cc b/tensorflow/c/tf_status_helper_test.cc
deleted file mode 100644
index 0bd9d1e4e3c..00000000000
--- a/tensorflow/c/tf_status_helper_test.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/c/tf_status_helper.h"
-
-#include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace {
-
-TEST(StatusHelper, TestStatusHelper) {
- TF_Status* s = TF_NewStatus();
- Status cc_status(errors::InvalidArgument("some error"));
- cc_status.SetPayload("key1", "value1");
- cc_status.SetPayload("key2", "value2");
- Set_TF_Status_from_Status(s, cc_status);
- ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s));
- ASSERT_EQ(std::string("some error"), TF_Message(s));
-
- Status another_cc_status(StatusFromTF_Status(s));
- ASSERT_FALSE(another_cc_status.ok());
- ASSERT_EQ(std::string("some error"), another_cc_status.error_message());
- ASSERT_EQ(error::INVALID_ARGUMENT, another_cc_status.code());
- // Ensure the payloads are not lost during conversions
- ASSERT_EQ(cc_status.GetPayload("key1"), another_cc_status.GetPayload("key1"));
- ASSERT_EQ(cc_status.GetPayload("key2"), another_cc_status.GetPayload("key2"));
- TF_DeleteStatus(s);
-}
-
-} // namespace
-} // namespace tensorflow
diff --git a/tensorflow/c/tf_status_internal.h b/tensorflow/c/tf_status_internal.h
index 1e0f99819ff..7a40d6f518e 100644
--- a/tensorflow/c/tf_status_internal.h
+++ b/tensorflow/c/tf_status_internal.h
@@ -16,13 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_C_TF_STATUS_INTERNAL_H_
#define TENSORFLOW_C_TF_STATUS_INTERNAL_H_
-#include "tensorflow/core/platform/status.h"
+#include "tensorflow/tsl/c/tsl_status_internal.h"
-// Internal structures used by the status C API. These are likely to change
-// and should not be depended on.
-
-struct TF_Status {
- tensorflow::Status status;
-};
+typedef struct TSL_Status TF_Status;
#endif // TENSORFLOW_C_TF_STATUS_INTERNAL_H_
diff --git a/tensorflow/c/tf_status_test.cc b/tensorflow/c/tf_status_test.cc
deleted file mode 100644
index 50f5dfb0f96..00000000000
--- a/tensorflow/c/tf_status_test.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/c/tf_status.h"
-
-#include
-
-#include "tensorflow/c/tf_status_internal.h"
-#include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace {
-
-TEST(TF_Status, PayloadsSet) {
- TF_Status* tf_status = TF_NewStatus();
- TF_SetStatus(tf_status, TF_CANCELLED, "Error Message");
- TF_SetPayload(tf_status, "a", "1");
- TF_SetPayload(tf_status, "b", "2");
- TF_SetPayload(tf_status, "c", "3");
-
- const std::unordered_map payloads =
- errors::GetPayloads(tf_status->status);
- EXPECT_EQ(payloads.size(), 3);
- EXPECT_EQ(payloads.at("a"), "1");
- EXPECT_EQ(payloads.at("b"), "2");
- EXPECT_EQ(payloads.at("c"), "3");
- TF_DeleteStatus(tf_status);
-}
-
-} // namespace
-} // namespace tensorflow
diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc
index 7bf662d81e0..e007af200c4 100644
--- a/tensorflow/c/tf_tensor.cc
+++ b/tensorflow/c/tf_tensor.cc
@@ -247,7 +247,7 @@ Status TensorInterface::BitcastFrom(const TensorInterface& from, DataType type,
const int64_t* new_dims, int num_new_dims) {
tensorflow::TensorShape s;
for (int i = 0; i < num_new_dims; ++i) {
- s.AddDim(new_dims[i]);
+ TF_RETURN_IF_ERROR(s.AddDimWithStatus(new_dims[i]));
}
return tensor_.BitcastFrom(from.tensor_, type, s);
}
diff --git a/tensorflow/c/tf_tstring.h b/tensorflow/c/tf_tstring.h
index 5dc29f23d59..f9fb2fe083f 100644
--- a/tensorflow/c/tf_tstring.h
+++ b/tensorflow/c/tf_tstring.h
@@ -59,4 +59,4 @@ TF_CAPI_EXPORT extern void TF_StringDealloc(TF_TString *tstr);
} /* end extern "C" */
#endif
-#endif // THIRD_PARTY_TENSORFLOW_C_TF_TSTRING_H_
+#endif // TENSORFLOW_C_TF_TSTRING_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 04796a71711..4fc555871af 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -13,6 +13,7 @@ load(
load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_gen_op_wrappers_cc")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = ["//visibility:public"],
licenses = ["notice"],
)
@@ -759,7 +760,7 @@ tf_gen_op_wrappers_cc(
"function_ops",
],
pkg = "//tensorflow/core",
- visibility = ["//tensorflow:internal"],
+ visibility = ["//visibility:public"],
)
tf_gen_op_wrappers_cc(
diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc
index 27ec4c0871d..3c5357f739e 100644
--- a/tensorflow/cc/client/client_session_test.cc
+++ b/tensorflow/cc/client/client_session_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
#include "tensorflow/cc/client/client_session.h"
+#include
#include
#include "absl/synchronization/barrier.h"
@@ -39,6 +40,14 @@ using ops::Mul;
using ops::Placeholder;
using ops::Sub;
+tensorflow::SessionOptions GetSessionOptions() {
+ tensorflow::SessionOptions options;
+ // Disable optimizations for static graph to allow calls to Session::Extend.
+ options.config.mutable_experimental()->set_disable_optimize_for_static_graph(
+ true);
+ return options;
+}
+
class CustomThreadPoolImpl : public thread::ThreadPoolInterface {
public:
explicit CustomThreadPoolImpl(int numThreads) {
@@ -100,7 +109,7 @@ TEST(ClientSessionTest, Extend) {
Scope root = Scope::NewRootScope();
auto a = Placeholder(root, DT_INT32, Placeholder::Shape({2}));
auto c = Add(root, a, {2, 2});
- ClientSession session(root);
+ ClientSession session(root, GetSessionOptions());
std::vector outputs;
TF_EXPECT_OK(session.Run({{a, {1, 1}}}, {c}, &outputs));
@@ -116,7 +125,7 @@ TEST(ClientSessionTest, MultiThreadedWithDefaultThreadpool) {
Scope root = Scope::NewRootScope();
auto a = Add(root, {1, 2}, {3, 4});
auto b = Mul(root, {1, 2}, {3, 4});
- ClientSession session(root);
+ ClientSession session(root, GetSessionOptions());
{
thread::ThreadPool thread_pool(Env::Default(), "pool", 2);
thread_pool.Schedule([&session, a]() {
@@ -143,7 +152,7 @@ TEST(ClientSessionTest, MultiThreadedWithCustomThreadpool) {
int num_threads = 3;
auto a = Add(root, {1, 2}, {3, 4});
auto b = Mul(root, {1, 2}, {3, 4});
- ClientSession session(root);
+ ClientSession session(root, GetSessionOptions());
auto inter_op_threadpool =
absl::make_unique(num_threads);
diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD
index 5313b502bf5..7c1a040960f 100644
--- a/tensorflow/cc/experimental/base/public/BUILD
+++ b/tensorflow/cc/experimental/base/public/BUILD
@@ -11,6 +11,7 @@
load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
# This is intentionally public
default_visibility = [
"//visibility:public",
diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD
index 5f442faa77c..e749d2433bd 100644
--- a/tensorflow/cc/experimental/base/tests/BUILD
+++ b/tensorflow/cc/experimental/base/tests/BUILD
@@ -4,6 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
load("//tensorflow:tensorflow.bzl", "tf_cc_test")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
licenses = ["notice"],
)
diff --git a/tensorflow/cc/experimental/libexport/BUILD b/tensorflow/cc/experimental/libexport/BUILD
index 5533cf76431..910ab930440 100644
--- a/tensorflow/cc/experimental/libexport/BUILD
+++ b/tensorflow/cc/experimental/libexport/BUILD
@@ -5,6 +5,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility =
[
"//tensorflow/cc/experimental/libtf:__subpackages__",
diff --git a/tensorflow/cc/experimental/libtf/BUILD b/tensorflow/cc/experimental/libtf/BUILD
index e9529725d94..e281672de9e 100644
--- a/tensorflow/cc/experimental/libtf/BUILD
+++ b/tensorflow/cc/experimental/libtf/BUILD
@@ -12,6 +12,7 @@ load("//tensorflow:tensorflow.default.bzl", "filegroup")
load("//tensorflow:strict.default.bzl", "py_strict_binary")
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/impl/BUILD b/tensorflow/cc/experimental/libtf/impl/BUILD
index 8231a25102e..0eae5a1f05c 100644
--- a/tensorflow/cc/experimental/libtf/impl/BUILD
+++ b/tensorflow/cc/experimental/libtf/impl/BUILD
@@ -10,6 +10,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/mlir/BUILD b/tensorflow/cc/experimental/libtf/mlir/BUILD
index 2d42d855dae..51336186510 100644
--- a/tensorflow/cc/experimental/libtf/mlir/BUILD
+++ b/tensorflow/cc/experimental/libtf/mlir/BUILD
@@ -6,6 +6,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/object.h b/tensorflow/cc/experimental/libtf/object.h
index 8f510def431..72d05aaf430 100644
--- a/tensorflow/cc/experimental/libtf/object.h
+++ b/tensorflow/cc/experimental/libtf/object.h
@@ -166,7 +166,7 @@ class Object : public Handle {
if (class_dict_maybe.type() == TaggedValue::DICT) {
auto& dict = class_dict_maybe.dict();
auto it = dict.find(key.value_);
- if (it != value_.dict().end()) {
+ if (it != dict.end()) {
return Cast(Handle(it->second));
}
}
diff --git a/tensorflow/cc/experimental/libtf/runtime/BUILD b/tensorflow/cc/experimental/libtf/runtime/BUILD
index 75f81a5a8a2..b20c0e6e3f9 100644
--- a/tensorflow/cc/experimental/libtf/runtime/BUILD
+++ b/tensorflow/cc/experimental/libtf/runtime/BUILD
@@ -4,6 +4,7 @@ load(
)
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/runtime/core/BUILD b/tensorflow/cc/experimental/libtf/runtime/core/BUILD
index cb750c4c7a4..83f61ee11ba 100644
--- a/tensorflow/cc/experimental/libtf/runtime/core/BUILD
+++ b/tensorflow/cc/experimental/libtf/runtime/core/BUILD
@@ -1,4 +1,5 @@
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD b/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD
index 6350e007875..586ef6b9523 100644
--- a/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD
+++ b/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD
@@ -1,4 +1,5 @@
package(
+ # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
default_visibility = [
"//tensorflow/cc/experimental/libtf:__subpackages__",
],
diff --git a/tensorflow/cc/experimental/libtf/tests/runtime_test_core.cc b/tensorflow/cc/experimental/libtf/tests/runtime_test_core.cc
index 0be93c31a28..59952002522 100644
--- a/tensorflow/cc/experimental/libtf/tests/runtime_test_core.cc
+++ b/tensorflow/cc/experimental/libtf/tests/runtime_test_core.cc
@@ -21,7 +21,7 @@ namespace runtime {
INSTANTIATE_TEST_SUITE_P(TF2CAPI, RuntimeTest,
::testing::Values(core::Runtime));
-
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(RuntimeTest);
} // namespace runtime
} // namespace libtf
} // namespace tf
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index d0cd220f112..031451d3d2d 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/cc/framework/cc_op_gen.h"
+#include
#include
#include
#include
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index 9af3b9ce1e3..7b348365b33 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#include
+
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/op_gen_lib.h"
#include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/cc/framework/cc_op_gen_util.h b/tensorflow/cc/framework/cc_op_gen_util.h
index d6c729f2dc9..8fb90356841 100644
--- a/tensorflow/cc/framework/cc_op_gen_util.h
+++ b/tensorflow/cc/framework/cc_op_gen_util.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_UTIL_H_
#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_UTIL_H_
+#include
#include
#include
#include
diff --git a/tensorflow/cc/framework/fuzzing/BUILD b/tensorflow/cc/framework/fuzzing/BUILD
index 4c6b0d80baf..c14b324fdf2 100644
--- a/tensorflow/cc/framework/fuzzing/BUILD
+++ b/tensorflow/cc/framework/fuzzing/BUILD
@@ -7,6 +7,8 @@ load(
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
+# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
+
cc_library(
name = "cc_op_fuzz_gen_main",
srcs = [
@@ -28,6 +30,7 @@ cc_library(
"//tensorflow/core/platform:hash",
"//tensorflow/tsl/platform:status",
"@com_google_absl//absl/container:flat_hash_map",
+ "@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
],
)
@@ -36,15 +39,8 @@ cc_library(
# tf_gen_op_wrappers_fuzz(
# name = "array_ops_fuzz",
# api_def_srcs = ["//tensorflow/core/api_def:base_api_def"],
-# extra_gen_deps = ["//tensorflow/c/kernels:bitcast_op_lib"],
-# op_lib_names = [
-# "array_ops",
-# ],
-# pkg = "//tensorflow/core",
-# deps = [
-# "//third_party/mediapipe/framework/port:parse_text_proto",
+# kernel_deps = [
# "//tensorflow/c/kernels:bitcast_op",
-# "//tensorflow/cc:cc_ops",
# "//tensorflow/core/kernels:array",
# "//tensorflow/core/kernels:check_numerics_op",
# "//tensorflow/core/kernels:fake_quant_ops",
@@ -57,6 +53,7 @@ cc_library(
# "//tensorflow/core/kernels/linalg:matrix_diag_op",
# "//tensorflow/core/kernels/linalg:matrix_set_diag_op",
# ],
+# op_def_src = "//tensorflow/core/ops:array_ops_op_lib",
# )
# copybara:uncomment_end
diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc
index 02af4b4aa86..416bb56e820 100644
--- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc
+++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc
@@ -202,11 +202,58 @@ string WriteFuzzTest(const OpInfo& op_info) {
}));
}
+string FuzzerFileStart() {
+ const string fuzz_namespace_begin = R"namespace(
+namespace tensorflow {
+namespace fuzzing {
+
+)namespace";
+
+ const string fuzz_header = strings::StrCat(
+ R"include(// This file is MACHINE GENERATED! Do not edit.
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/security/fuzzing/cc/fuzz_session.h"
+#include "third_party/mediapipe/framework/port/parse_text_proto.h"
+)include",
+ fuzz_namespace_begin);
+
+ return fuzz_header;
+}
+
+string FuzzerFileEnd() {
+ const string fuzz_footer = R"footer(
+} // namespace fuzzing
+} // namespace tensorflow
+)footer";
+
+ return fuzz_footer;
+}
+
+} // namespace
+
bool OpFuzzingIsOk(const OpInfo& op_info) {
+ // Skip deprecated ops.
+ if (op_info.graph_op_def.has_deprecation() &&
+ op_info.graph_op_def.deprecation().version() <= TF_GRAPH_DEF_VERSION) {
+ std::cout << "NOT fuzzing: " << op_info.graph_op_def.name()
+ << " is deprecated.\n";
+ return false;
+ }
+
// TODO(unda, b/249347507): should we hide fuzzers for hidden ops?
- if (op_info.api_def.visibility() == ApiDef::HIDDEN) return false;
+ if (op_info.api_def.visibility() == ApiDef::HIDDEN) {
+ std::cout << "NOT fuzzing: " << op_info.graph_op_def.name()
+ << " is hidden.\n";
+ return false;
+ }
- if (op_info.api_def.visibility() == ApiDef::SKIP) return false;
+ if (op_info.api_def.visibility() == ApiDef::SKIP) {
+ std::cout << "NOT fuzzing: " << op_info.graph_op_def.name()
+ << " is skipped.\n";
+ return false;
+ }
// TODO(unda) : zero input ops
std::set<string> zero_input_ops = {"Placeholder", "ImmutableConst"};
@@ -272,56 +319,10 @@ bool OpFuzzingIsOk(const OpInfo& op_info) {
return true;
}
-string FuzzerFileStart() {
- const string fuzz_namespace_begin = R"namespace(
-namespace tensorflow {
-namespace fuzzing {
-
-)namespace";
-
- const string fuzz_header = strings::StrCat(
- R"include(// This file is MACHINE GENERATED! Do not edit.
-
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/security/fuzzing/cc/fuzz_session.h"
-#include "third_party/mediapipe/framework/port/parse_text_proto.h"
-)include",
- fuzz_namespace_begin);
-
- return fuzz_header;
-}
-
-string FuzzerFileEnd() {
- const string fuzz_footer = R"footer(
-} // namespace fuzzing
-} // namespace tensorflow
-)footer";
-
- return fuzz_footer;
-}
-
-} // namespace
-
-string WriteFuzzers(const OpList& ops, const ApiDefMap& api_def_map) {
+string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable) {
return absl::StrCat(
- FuzzerFileStart(),
- absl::StrJoin(
- ops.op(), "",
- [&api_def_map](string* out, const OpDef& op_def) {
- // Skip deprecated ops.
- bool skip = op_def.has_deprecation() &&
- op_def.deprecation().version() <= TF_GRAPH_DEF_VERSION;
- const auto* api_Def = api_def_map.GetApiDef(op_def.name());
-          OpInfo op_info(op_def, *api_Def, std::vector<string>());
- skip |= !OpFuzzingIsOk(op_info);
- if (!skip) {
- out->append(WriteClassFuzzDef(op_info));
- out->append(WriteFuzzTest(op_info));
- out->append("\n");
- }
- }),
- FuzzerFileEnd());
+ FuzzerFileStart(), is_fuzzable ? WriteClassFuzzDef(op_info) : "",
+ is_fuzzable ? WriteFuzzTest(op_info) : "", FuzzerFileEnd());
}
} // namespace cc_op
diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h
index 6770430ad69..c11c9635d6d 100644
--- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h
+++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_CC_FRAMEWORK_FUZZING_CC_OP_FUZZ_GEN_H_
#define TENSORFLOW_CC_FRAMEWORK_FUZZING_CC_OP_FUZZ_GEN_H_
+#include "tensorflow/cc/framework/cc_op_gen_util.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/op_gen_lib.h"
#include "tensorflow/core/platform/types.h"
@@ -23,8 +24,11 @@ limitations under the License.
namespace tensorflow {
namespace cc_op {
-/// String with fuzzer file contents.
-string WriteFuzzers(const OpList& ops, const ApiDefMap& api_def_map);
+// String with single fuzzer file content.
+string WriteSingleFuzzer(const OpInfo& op_info, bool is_fuzzable);
+
+// Do we have all we need to create a fuzzer
+bool OpFuzzingIsOk(const OpInfo& op_info);
} // namespace cc_op
} // namespace tensorflow
diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc
index 0a1de103d37..99388eb8847 100644
--- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc
+++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc
@@ -14,10 +14,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include <algorithm>
#include
#include
#include
+#include "absl/status/status.h"
#include "tensorflow/cc/framework/cc_op_gen_util.h"
#include "tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h"
#include "tensorflow/core/framework/op_def.pb.h"
@@ -33,20 +35,33 @@ namespace tensorflow {
namespace cc_op {
namespace {
-void WriteAllFuzzers(const std::string& file_name, bool include_internal,
-                     const std::vector<string>& api_def_dirs) {
+void WriteAllFuzzers(string root_location, std::vector<string> api_def_dirs,
+                     std::vector<string> op_names) {
OpList ops;
-  StatusOr api_def_map =
-      LoadOpsAndApiDefs(ops, include_internal, api_def_dirs);
+  StatusOr<ApiDefMap> api_def_map = LoadOpsAndApiDefs(ops, false, api_def_dirs);
TF_CHECK_OK(api_def_map.status());
- WriteFuzzers(ops, api_def_map.value());
Env* env = Env::Default();
+ tsl::Status status;
std::unique_ptr<WritableFile> fuzz_file = nullptr;
- auto status = env->NewWritableFile(file_name, &fuzz_file);
- status.Update(fuzz_file->Append(WriteFuzzers(ops, api_def_map.value())));
- status.Update(fuzz_file->Close());
+ for (const OpDef& op_def : ops.op()) {
+ if (std::find(op_names.begin(), op_names.end(), op_def.name()) ==
+ op_names.end())
+ continue;
+
+ const ApiDef* api_def = api_def_map->GetApiDef(op_def.name());
+ if (api_def == nullptr) {
+ continue;
+ }
+
+  OpInfo op_info(op_def, *api_def, std::vector<string>());
+ status.Update(env->NewWritableFile(
+ root_location + "/" + op_def.name() + "_fuzz.cc", &fuzz_file));
+ status.Update(
+ fuzz_file->Append(WriteSingleFuzzer(op_info, OpFuzzingIsOk(op_info))));
+ status.Update(fuzz_file->Close());
+ }
TF_CHECK_OK(status);
}
@@ -60,17 +75,17 @@ int main(int argc, char* argv[]) {
for (int i = 1; i < argc; ++i) {
fprintf(stderr, "Arg %d = %s\n", i, argv[i]);
}
- fprintf(stderr,
- "Usage: %s out include_internal "
- "api_def_dirs1,api_def_dir2 ...\n"
- " include_internal: 1 means include internal ops\n",
+ fprintf(stderr, "Usage: %s location api_def1,api_def2 op1,op2,op3\n",
argv[0]);
exit(1);
}
-
- bool include_internal = tensorflow::StringPiece("1") == argv[2];
-  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+ for (int i = 1; i < argc; ++i) {
+ fprintf(stdout, "Arg %d = %s\n", i, argv[i]);
+ }
+  std::vector<tensorflow::string> api_def_srcs = tensorflow::str_util::Split(
+      argv[2], ",", tensorflow::str_util::SkipEmpty());
+  std::vector<tensorflow::string> op_names = tensorflow::str_util::Split(
argv[3], ",", tensorflow::str_util::SkipEmpty());
- tensorflow::cc_op::WriteAllFuzzers(argv[1], include_internal, api_def_dirs);
+ tensorflow::cc_op::WriteAllFuzzers(argv[1], api_def_srcs, op_names);
return 0;
}
diff --git a/tensorflow/cc/framework/fuzzing/op_fuzzing.bzl b/tensorflow/cc/framework/fuzzing/op_fuzzing.bzl
index aac616f8928..2dfb4d08589 100644
--- a/tensorflow/cc/framework/fuzzing/op_fuzzing.bzl
+++ b/tensorflow/cc/framework/fuzzing/op_fuzzing.bzl
@@ -12,108 +12,160 @@ load(
"cc_test",
)
-def tf_gen_op_wrapper_fuzz(
+def tf_gen_op_wrappers_fuzz(
name,
- out_ops_file,
- pkg = "",
- deps = None,
- include_internal_ops = 0,
- api_def_srcs = []):
+ op_def_src,
+ api_def_srcs = [],
+ kernel_deps = []):
"""
- Generates a file with fuzzers for a subset of ops.
+ Generates fuzzers for several groups of ops.
+
+ For each one we need the corresponding OpDef, ApiDef and KernelDef,
+ since they all can contain constraints for the inputs.
Args:
- name: name of the op class
- out_ops_file: prefix for file generation
- pkg: where to find op registrations
- deps: depedencies
- include_internal_ops: true if we should generate internal ops
- api_def_srcs: which op definitions to use
+ name: the name of the fuzz artifact
+ op_def_src: op definitions
+ api_def_srcs: api definitions
+ kernel_deps: op kernel dependencies
"""
- tool = out_ops_file + "_gen_fuzz"
- if deps == None:
- deps = [pkg + ":" + name + "_op_lib"]
+ # Create tool to generate .cc fuzzer files.
tf_cc_binary(
- name = tool,
+ name = "op_fuzz_gen_tool",
copts = tf_copts(),
linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + lrt_if_needed(),
linkstatic = 1, # Faster to link this one-time-use binary dynamically
deps = [
"//tensorflow/cc/framework/fuzzing:cc_op_fuzz_gen_main",
- ] + deps,
+ op_def_src,
+ ] + kernel_deps,
)
- srcs = api_def_srcs[:]
+ # Add relevant locations to look for api_defs.
+ api_def_src_locations = ",".join(["$$(dirname $$(echo $(locations " + api_def_src + ") | cut -d\" \" -f1))" for api_def_src in api_def_srcs])
- if not api_def_srcs:
- api_def_args_str = ","
- else:
- api_def_args = []
- for api_def_src in api_def_srcs:
- # Add directory of the first ApiDef source to args.
- # We are assuming all ApiDefs in a single api_def_src are in the
- # same directory.
- api_def_args.append(
- " $$(dirname $$(echo $(locations " + api_def_src +
- ") | cut -d\" \" -f1))",
- )
- api_def_args_str = ",".join(api_def_args)
-
- out_fuzz_file = out_ops_file + "_fuzz.cc"
+ out_fuzz_files = [op_name + "_fuzz.cc" for op_name in op_names]
native.genrule(
name = name + "_genrule",
- outs = [
- out_fuzz_file,
- ],
- srcs = srcs,
- tools = [":" + tool], # + tf_binary_additional_srcs(),
- cmd = ("$(location :" + tool + ") $(location :" + out_fuzz_file + ") " +
- str(include_internal_ops) + " " + api_def_args_str),
+ outs = out_fuzz_files,
+ srcs = api_def_srcs,
+ tools = [":op_fuzz_gen_tool"],
+ cmd = ("$(location :op_fuzz_gen_tool) " +
+ " $$(dirname $(location " + out_fuzz_files[0] + "))" +
+ " " + api_def_src_locations + " " + (",".join(op_names))),
)
-def tf_gen_op_wrappers_fuzz(
- name,
- op_lib_names = [],
- pkg = "",
- deps = [
- "//tensorflow/cc:ops",
- "//tensorflow/cc:scope",
- "//tensorflow/cc:const_op",
- ],
- include_internal_ops = 0,
- api_def_srcs = [],
- extra_gen_deps = []):
- """
- Generates fuzzers for several groups of ops.
-
- Args:
- name: the name of the fuzz artifact
- op_lib_names: which op libraries to fuzz
- pkg: where to find op registrations
- deps: dependencies
- include_internal_ops: true if we should generate internal ops
- api_def_srcs: where to find the op definitions
- extra_gen_deps: extra dependencies for generation
- """
- fuzzsrcs = []
- for n in op_lib_names:
- tf_gen_op_wrapper_fuzz(
- n,
- "fuzzers/" + n,
- api_def_srcs = api_def_srcs,
- include_internal_ops = include_internal_ops,
- pkg = pkg,
- deps = [pkg + ":" + n + "_op_lib"] + extra_gen_deps,
+ for op_name in op_names:
+ cc_test(
+ name = op_name.lower() + "_fuzz",
+ srcs = [op_name + "_fuzz.cc"],
+ deps = kernel_deps +
+ [
+ "//tensorflow/security/fuzzing/cc:fuzz_session",
+ "@com_google_googletest//:gtest_main",
+ "@com_google_fuzztest//fuzztest",
+ "//tensorflow/cc:cc_ops",
+ "//third_party/mediapipe/framework/port:parse_text_proto",
+ ],
)
- fuzzsrcs.append("fuzzers/" + n + "_fuzz.cc")
- cc_test(
- name = name,
- srcs = fuzzsrcs,
- deps = deps +
- [
- "//tensorflow/security/fuzzing/cc:fuzz_session",
- "@com_google_googletest//:gtest_main",
- "@com_google_fuzztest//fuzztest",
- ],
- )
+
+op_names = [
+ "BatchMatrixBandPart",
+ "BatchMatrixDiag",
+ "BatchMatrixDiagPart",
+ "BatchMatrixSetDiag",
+ "BatchToSpace",
+ "BatchToSpaceND",
+ "Bitcast",
+ "BroadcastArgs",
+ "BroadcastTo",
+ "CheckNumerics",
+ "ConcatV2",
+ "ConjugateTranspose",
+ "DebugGradientIdentity",
+ "DeepCopy",
+ "DepthToSpace",
+ "Dequantize",
+ "EditDistance",
+ "Empty",
+ "EnsureShape",
+ "ExpandDims",
+ "ExtractImagePatches",
+ "ExtractVolumePatches",
+ "FakeQuantWithMinMaxArgs",
+ "FakeQuantWithMinMaxArgsGradient",
+ "FakeQuantWithMinMaxVars",
+ "FakeQuantWithMinMaxVarsGradient",
+ "FakeQuantWithMinMaxVarsPerChannel",
+ "FakeQuantWithMinMaxVarsPerChannelGradient",
+ "Fill",
+ "Fingerprint",
+ "Gather",
+ "GuaranteeConst",
+ "Identity",
+ "IdentityN",
+ "InplaceAdd",
+ "InplaceSub",
+ "InplaceUpdate",
+ "InvertPermutation",
+ "ListDiff",
+ "MatrixBandPart",
+ "MatrixDiag",
+ "MatrixDiagPart",
+ "MatrixDiagPartV2",
+ "MatrixDiagPartV3",
+ "MatrixDiagV2",
+ "MatrixDiagV3",
+ "MatrixSetDiag",
+ "MatrixSetDiagV2",
+ "MatrixSetDiagV3",
+ "MirrorPad",
+ "OneHot",
+ "OnesLike",
+ "Pack",
+ "Pad",
+ "PadV2",
+ "ParallelConcat",
+ "PlaceholderV2",
+ "PlaceholderWithDefault",
+ "PreventGradient",
+ "QuantizeAndDequantize",
+ "QuantizeV2",
+ "Rank",
+ "Reshape",
+ "ResourceStridedSliceAssign",
+ "ReverseSequence",
+ "ReverseV2",
+ "ScatterNdNonAliasingAdd",
+ "Shape",
+ "ShapeN",
+ "Size",
+ "Slice",
+ "Snapshot",
+ "SpaceToBatch",
+ "SpaceToBatchND",
+ "SpaceToDepth",
+ "Split",
+ "SplitV",
+ "Squeeze",
+ "StopGradient",
+ "StridedSlice",
+ "StridedSliceGrad",
+ "TensorScatterAdd",
+ "TensorScatterMax",
+ "TensorScatterMin",
+ "TensorScatterSub",
+ "TensorStridedSliceUpdate",
+ "Tile",
+ "TileGrad",
+ "Transpose",
+ "Unique",
+ "UniqueV2",
+ "UniqueWithCounts",
+ "UniqueWithCountsV2",
+ "Unpack",
+ "UnravelIndex",
+ "Where",
+ "ZerosLike",
+]
diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h
index 0fc5abb20c8..951144cf8ce 100644
--- a/tensorflow/cc/framework/grad_op_registry.h
+++ b/tensorflow/cc/framework/grad_op_registry.h
@@ -16,7 +16,9 @@ limitations under the License.
#ifndef TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
#define TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#include
#include
+#include
#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc
index 0013ea732df..0c026cf9a0c 100644
--- a/tensorflow/cc/framework/gradient_checker.cc
+++ b/tensorflow/cc/framework/gradient_checker.cc
@@ -15,6 +15,9 @@ limitations under the License.
#include "tensorflow/cc/framework/gradient_checker.h"
+#include
+#include
+
#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/framework/gradients.h"
#include "tensorflow/cc/ops/standard_ops.h"
diff --git a/tensorflow/cc/framework/gradient_checker.h b/tensorflow/cc/framework/gradient_checker.h
index 1aa215a9088..b8db767f77c 100644
--- a/tensorflow/cc/framework/gradient_checker.h
+++ b/tensorflow/cc/framework/gradient_checker.h
@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
#define TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#include <vector>
+
#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index cdb3d0c7d68..3dd2ab3ab82 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -16,6 +16,11 @@ limitations under the License.
#include "tensorflow/cc/framework/gradients.h"
#include
+#include